Data File Description: * Sourced: Coinbase * Consists of daily Bitcoin prices from December 2014 - November 2020 * 2,182 Data Points: this is a large enough data set to provide analysis Import Dataset
# Load the daily BTC/USD candles from the Coinbase export
btc_data <- read.csv(file = "Coinbase_BTCUSD_d.csv", header = TRUE)
# Size of the raw import as (rows, columns); the row count is the number of
# daily data points
dim(btc_data)
## [1] 2182 9
# Display the first 6 elements to ensure that the data is read
head(btc_data)
## Timestamp Date Symbol Open High Low Close Volume.BTC
## 1 1605830400 2020-11-20 BTCUSD 17821.58 18239.00 17764.76 18142.52 3909.44
## 2 1605744000 2020-11-19 BTCUSD 17782.91 18193.29 17356.00 17821.58 17141.49
## 3 1605657600 2020-11-18 BTCUSD 17679.36 18488.00 17205.02 17782.91 32425.64
## 4 1605571200 2020-11-17 BTCUSD 16726.64 17880.00 16575.42 17679.36 25230.04
## 5 1605484800 2020-11-16 BTCUSD 15966.89 16892.00 15879.00 16726.64 13948.06
## 6 1605398400 2020-11-15 BTCUSD 16082.01 16175.60 15796.09 15966.89 6250.08
## Volume.USD
## 1 70437003
## 2 306201498
## 3 579119955
## 4 436549314
## 5 230076772
## 6 99871183
tail(btc_data)
## Timestamp Date Symbol Open High Low Close Volume.BTC
## 2177 1417824000 2014-12-06 BTCUSD 377.1 378.0 377.10 378.0 0.01500
## 2178 1417737600 2014-12-05 BTCUSD 377.1 377.1 377.10 377.1 0.00000
## 2179 1417651200 2014-12-04 BTCUSD 378.0 378.0 377.10 377.1 0.01000
## 2180 1417564800 2014-12-03 BTCUSD 378.0 378.0 377.01 378.0 0.54660
## 2181 1417478400 2014-12-02 BTCUSD 370.0 378.0 370.00 378.0 15.01000
## 2182 1417392000 2014-12-01 BTCUSD 300.0 370.0 300.00 370.0 0.05656
## Volume.USD
## 2177 5.67
## 2178 0.00
## 2179 3.77
## 2180 206.52
## 2181 5675.07
## 2182 19.53
Order the rows by ascending date
# Sort chronologically, oldest day first
btc <- btc_data[order(btc_data[["Date"]]), ]
# Confirm the new ordering
head(btc)
## Timestamp Date Symbol Open High Low Close Volume.BTC
## 2182 1417392000 2014-12-01 BTCUSD 300.0 370.0 300.00 370.0 0.05656
## 2181 1417478400 2014-12-02 BTCUSD 370.0 378.0 370.00 378.0 15.01000
## 2180 1417564800 2014-12-03 BTCUSD 378.0 378.0 377.01 378.0 0.54660
## 2179 1417651200 2014-12-04 BTCUSD 378.0 378.0 377.10 377.1 0.01000
## 2178 1417737600 2014-12-05 BTCUSD 377.1 377.1 377.10 377.1 0.00000
## 2177 1417824000 2014-12-06 BTCUSD 377.1 378.0 377.10 378.0 0.01500
## Volume.USD
## 2182 19.53
## 2181 5675.07
## 2180 206.52
## 2179 3.77
## 2178 0.00
## 2177 5.67
Clean the data further * Date was converted from Chr format to Date format * Symbol Column was removed
# Parse the "YYYY-MM-DD" text in the Date column into Date objects
btc[["Date"]] <- as.Date(btc[["Date"]], format = "%Y-%m-%d")
head(btc)
## Timestamp Date Symbol Open High Low Close Volume.BTC
## 2182 1417392000 2014-12-01 BTCUSD 300.0 370.0 300.00 370.0 0.05656
## 2181 1417478400 2014-12-02 BTCUSD 370.0 378.0 370.00 378.0 15.01000
## 2180 1417564800 2014-12-03 BTCUSD 378.0 378.0 377.01 378.0 0.54660
## 2179 1417651200 2014-12-04 BTCUSD 378.0 378.0 377.10 377.1 0.01000
## 2178 1417737600 2014-12-05 BTCUSD 377.1 377.1 377.10 377.1 0.00000
## 2177 1417824000 2014-12-06 BTCUSD 377.1 378.0 377.10 378.0 0.01500
## Volume.USD
## 2182 19.53
## 2181 5675.07
## 2180 206.52
## 2179 3.77
## 2178 0.00
## 2177 5.67
# Drop the Symbol column
btc <- subset(btc, select = -Symbol)
head(btc)
## Timestamp Date Open High Low Close Volume.BTC Volume.USD
## 2182 1417392000 2014-12-01 300.0 370.0 300.00 370.0 0.05656 19.53
## 2181 1417478400 2014-12-02 370.0 378.0 370.00 378.0 15.01000 5675.07
## 2180 1417564800 2014-12-03 378.0 378.0 377.01 378.0 0.54660 206.52
## 2179 1417651200 2014-12-04 378.0 378.0 377.10 377.1 0.01000 3.77
## 2178 1417737600 2014-12-05 377.1 377.1 377.10 377.1 0.00000 0.00
## 2177 1417824000 2014-12-06 377.1 378.0 377.10 378.0 0.01500 5.67
Close price of the next day * Closing price is an accurate representation of overall price and is slightly less volatile than daily high or low prices * We decided not to use this variable. However, to improve our model in the future, it could be used to predict the future price of Bitcoin rather than merely whether the price will increase or decrease
#btc$Close.nextday = 0
#test_var <- btc$Close
#column_data_close_price <- 0
#for(i in 1:length(test_var)) {
#column_data_close_price[i] <- test_var[i+1]
#}
#btc$Close.nextday = column_data_close_price
#head(btc)
Create a binary close variable (H/L) * This binary variable indicates whether tomorrow's closing price increases (H) or decreases/stays the same (L)
# Binary target HL.Close: 1 (H) when the NEXT row's close is strictly higher
# than this row's close, 0 (L) when it is lower or unchanged. btc must be
# sorted by ascending date so that "next row" means "next day".
test_var <- btc$Close
# Shift the closes up by one position; the last row has no following day, so
# its "next close" is NA.
next_close <- c(test_var[-1], NA)[seq_along(test_var)]
# The comparison yields NA wherever either close is missing, so the final row
# is labelled NA (unknown) instead of being counted as a price increase.
column_data_close_HL <- as.numeric(next_close > test_var)
btc$HL.Close <- column_data_close_HL
p <- btc$HL.Close
head(btc)
## Timestamp Date Open High Low Close Volume.BTC Volume.USD
## 2182 1417392000 2014-12-01 300.0 370.0 300.00 370.0 0.05656 19.53
## 2181 1417478400 2014-12-02 370.0 378.0 370.00 378.0 15.01000 5675.07
## 2180 1417564800 2014-12-03 378.0 378.0 377.01 378.0 0.54660 206.52
## 2179 1417651200 2014-12-04 378.0 378.0 377.10 377.1 0.01000 3.77
## 2178 1417737600 2014-12-05 377.1 377.1 377.10 377.1 0.00000 0.00
## 2177 1417824000 2014-12-06 377.1 378.0 377.10 378.0 0.01500 5.67
## HL.Close
## 2182 1
## 2181 0
## 2180 0
## 2179 0
## 2178 1
## 2177 0
## Cryptocurrencies Section
Data Description * Ethereum daily price data from May 27 2016 - November 20 2020 * Source: Coinbase
# Load the daily ETH/USD candles from the Coinbase export
eth_data <- read.csv(file = "Coinbase_ETHUSD_d.csv", header = TRUE)
# Sort chronologically, oldest day first
eth <- eth_data[order(eth_data[["Date"]]), ]
# Parse the Date column into Date objects
eth[["Date"]] <- as.Date(eth[["Date"]], format = "%Y-%m-%d")
# Drop the timestamp, symbol, Open/High/Low and coin-denominated volume
eth <- subset(
  eth,
  select = -c(Unix.Timestamp, Symbol, Open, High, Low, Volume.ETH)
)
# Give the remaining value columns ETH-specific names
names(eth)[names(eth) == "Close"] <- "ETH.Price"
names(eth)[names(eth) == "Volume.USD"] <- "ETH.Volume"
# Inspect the first rows to check that the data was read properly
head(eth)
## Date ETH.Price ETH.Volume
## 1639 2016-05-27 11.25 151147.98
## 1638 2016-05-28 11.93 180822.02
## 1637 2016-05-29 12.34 42228.37
## 1636 2016-05-30 12.41 51655.95
## 1635 2016-05-31 14.00 76994.75
## 1634 2016-06-01 13.93 145746.12
tail(eth)
## Date ETH.Price ETH.Volume
## 6 2020-11-15 448.58 40271351
## 5 2020-11-16 460.85 51758620
## 4 2020-11-17 482.68 93082972
## 3 2020-11-18 478.96 141725015
## 2 2020-11-19 471.92 62514644
## 1 2020-11-20 484.88 29558467
Data Description * Litecoin daily price data from August 23 2016 - November 20 2020 * Source: Coinbase
# Load the daily LTC/USD candles from the Coinbase export
ltc_data <- read.csv(file = "Coinbase_LTCUSD_d.csv", header = TRUE)
# Sort chronologically, oldest day first
ltc <- ltc_data[order(ltc_data[["Date"]]), ]
# Parse the Date column into Date objects
ltc[["Date"]] <- as.Date(ltc[["Date"]], format = "%Y-%m-%d")
# Drop the timestamp, symbol, Open/High/Low and coin-denominated volume
ltc <- subset(
  ltc,
  select = -c(Unix.Timestamp, Symbol, Open, High, Low, Volume.LTC)
)
# Give the remaining value columns LTC-specific names
names(ltc)[names(ltc) == "Close"] <- "LTC.Price"
names(ltc)[names(ltc) == "Volume.USD"] <- "LTC.Volume"
# Inspect the first rows to check that the data was read properly
head(ltc)
## Date LTC.Price LTC.Volume
## 1551 2016-08-23 3.95 1737.31
## 1550 2016-08-24 3.84 19247.53
## 1549 2016-08-25 3.81 19276.60
## 1548 2016-08-26 3.81 12746.27
## 1547 2016-08-27 3.78 4295.72
## 1546 2016-08-28 3.72 7111.87
tail(ltc)
## Date LTC.Price LTC.Volume
## 6 2020-11-15 62.37 8286862
## 5 2020-11-16 73.83 43992549
## 4 2020-11-17 76.41 45138596
## 3 2020-11-18 73.48 48863408
## 2 2020-11-19 81.64 63647825
## 1 2020-11-20 81.22 13261137
Data Description * Ripple daily price data from January 17 2017 - November 20 2020 * Source: Bitstamp
# Load the daily XRP/USD candles from the Bitstamp export
xrp_data <- read.csv(file = "Bitstamp_XRPUSD_d.csv", header = TRUE)
# Sort chronologically, oldest day first
xrp <- xrp_data[order(xrp_data[["Date"]]), ]
# Parse the Date column into Date objects
xrp[["Date"]] <- as.Date(xrp[["Date"]], format = "%Y-%m-%d")
# Drop the timestamp, symbol, Open/High/Low and coin-denominated volume
xrp <- subset(
  xrp,
  select = -c(Unix.Timestamp, Symbol, Open, High, Low, Volume.XRP)
)
# Give the remaining value columns XRP-specific names
names(xrp)[names(xrp) == "Close"] <- "XRP.Price"
names(xrp)[names(xrp) == "Volume.USD"] <- "XRP.Volume"
# Inspect the first rows to check that the data was read properly
head(xrp)
## Date XRP.Price XRP.Volume
## 1404 2017-01-17 0.00683 30673.69
## 1403 2017-01-18 0.00680 38018.93
## 1402 2017-01-19 0.00684 19882.33
## 1401 2017-01-20 0.00660 11374.15
## 1400 2017-01-21 0.00684 13955.92
## 1399 2017-01-22 0.00678 3878.76
tail(xrp)
## Date XRP.Price XRP.Volume
## 6 2020-11-15 0.2697 14292147
## 5 2020-11-16 0.2880 23107272
## 4 2020-11-17 0.3026 35535529
## 3 2020-11-18 0.2937 40662719
## 2 2020-11-19 0.3044 34206680
## 1 2020-11-20 0.3005 8675902
# Row count of the XRP series, which covers the shortest time frame
nrow(xrp)
## [1] 1404
Filter Rows for Consistency * Because the other crypto series start on different dates than the BTC series, rows need to be removed so that all series are aligned + This will weaken the model, as it removes 790 BTC data points (2,182 –> 1,392)
# Rows to keep per series: the XRP row count (XRP has the fewest historical
# data points) minus 12.
# NOTE(review): the original author flagged the 12 as unexplained ("TBD").
# It presumably moves the common start to 2017-01-29 so that the series line
# up with the first date of the Google Trends file - confirm.
c_rows <- nrow(xrp) - 12
# Bitcoin: keep the most recent c_rows days
btc_c <- tail(btc, n = c_rows)
head(btc_c)
## Timestamp Date Open High Low Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29 924.70 927.47 915.00 917.31 2498.61 2303176
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36 3385239
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94 6298154
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96 5835317
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69 5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61 6815466
## HL.Close
## 1392 1
## 1391 1
## 1390 1
## 1389 1
## 1388 1
## 1387 1
tail(btc_c)
## Timestamp Date Open High Low Close Volume.BTC
## 6 1605398400 2020-11-15 16082.01 16175.60 15796.09 15966.89 6250.08
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64 13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36 25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91 32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58 17141.49
## 1 1605830400 2020-11-20 17821.58 18239.00 17764.76 18142.52 3909.44
## Volume.USD HL.Close
## 6 99871183 1
## 5 230076772 1
## 4 436549314 1
## 3 579119955 1
## 2 306201498 1
## 1 70437003 1
# Ethereum: keep only the most recent c_rows days
eth <- tail(eth, c_rows)
head(eth)
## Date ETH.Price ETH.Volume
## 1392 2017-01-29 10.50 189086.3
## 1391 2017-01-30 10.59 437746.5
## 1390 2017-01-31 10.74 413350.2
## 1389 2017-02-01 10.73 630953.7
## 1388 2017-02-02 10.82 513774.8
## 1387 2017-02-03 10.95 531755.4
tail(eth)
## Date ETH.Price ETH.Volume
## 6 2020-11-15 448.58 40271351
## 5 2020-11-16 460.85 51758620
## 4 2020-11-17 482.68 93082972
## 3 2020-11-18 478.96 141725015
## 2 2020-11-19 471.92 62514644
## 1 2020-11-20 484.88 29558467
# Litecoin: keep only the most recent c_rows days
ltc <- tail(ltc, c_rows)
head(ltc)
## Date LTC.Price LTC.Volume
## 1392 2017-01-29 3.88 2151.59
## 1391 2017-01-30 4.03 23569.63
## 1390 2017-01-31 4.07 35332.93
## 1389 2017-02-01 4.08 17621.75
## 1388 2017-02-02 4.09 17753.63
## 1387 2017-02-03 4.06 15202.71
tail(ltc)
## Date LTC.Price LTC.Volume
## 6 2020-11-15 62.37 8286862
## 5 2020-11-16 73.83 43992549
## 4 2020-11-17 76.41 45138596
## 3 2020-11-18 73.48 48863408
## 2 2020-11-19 81.64 63647825
## 1 2020-11-20 81.22 13261137
# Ripple: keep only the most recent c_rows days
xrp <- tail(xrp, c_rows)
head(xrp)
## Date XRP.Price XRP.Volume
## 1392 2017-01-29 0.00631 380.92
## 1391 2017-01-30 0.00645 3249.53
## 1390 2017-01-31 0.00641 13926.48
## 1389 2017-02-01 0.00649 13118.79
## 1388 2017-02-02 0.00640 13887.87
## 1387 2017-02-03 0.00638 12139.60
tail(xrp)
## Date XRP.Price XRP.Volume
## 6 2020-11-15 0.2697 14292147
## 5 2020-11-16 0.2880 23107272
## 4 2020-11-17 0.3026 35535529
## 3 2020-11-18 0.2937 40662719
## 2 2020-11-19 0.3044 34206680
## 1 2020-11-20 0.3005 8675902
Remove All but One Date Variable & Merge Datasets * The master data file now spans 2017 January 29 - 2020 November 20
# Merge the four coin series column-wise into one data frame.
# cbind() pairs rows purely by position, so first confirm that every series
# covers exactly the same dates as btc_c. A gap or a different date range in
# any file would otherwise silently pair prices from different days.
stopifnot(
  nrow(eth) == nrow(btc_c), all(eth$Date == btc_c$Date),
  nrow(ltc) == nrow(btc_c), all(ltc$Date == btc_c$Date),
  nrow(xrp) == nrow(btc_c), all(xrp$Date == btc_c$Date)
)
# Keep a single Date column (the one in btc_c) and drop the others
# Ethereum
eth <- subset(eth, select = -c(Date))
# Litecoin
ltc <- subset(ltc, select = -c(Date))
# Ripple
xrp <- subset(xrp, select = -c(Date))
# Merge the Data Frames
coins <- cbind(btc_c, eth, ltc, xrp)
head(coins)
## Timestamp Date Open High Low Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29 924.70 927.47 915.00 917.31 2498.61 2303176
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36 3385239
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94 6298154
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96 5835317
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69 5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61 6815466
## HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392 1 10.50 189086.3 3.88 2151.59 0.00631 380.92
## 1391 1 10.59 437746.5 4.03 23569.63 0.00645 3249.53
## 1390 1 10.74 413350.2 4.07 35332.93 0.00641 13926.48
## 1389 1 10.73 630953.7 4.08 17621.75 0.00649 13118.79
## 1388 1 10.82 513774.8 4.09 17753.63 0.00640 13887.87
## 1387 1 10.95 531755.4 4.06 15202.71 0.00638 12139.60
# From here on the merged frame is referred to as "master"
master <- coins
head(master)
## Timestamp Date Open High Low Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29 924.70 927.47 915.00 917.31 2498.61 2303176
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36 3385239
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94 6298154
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96 5835317
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69 5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61 6815466
## HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392 1 10.50 189086.3 3.88 2151.59 0.00631 380.92
## 1391 1 10.59 437746.5 4.03 23569.63 0.00645 3249.53
## 1390 1 10.74 413350.2 4.07 35332.93 0.00641 13926.48
## 1389 1 10.73 630953.7 4.08 17621.75 0.00649 13118.79
## 1388 1 10.82 513774.8 4.09 17753.63 0.00640 13887.87
## 1387 1 10.95 531755.4 4.06 15202.71 0.00638 12139.60
tail(master)
## Timestamp Date Open High Low Close Volume.BTC
## 6 1605398400 2020-11-15 16082.01 16175.60 15796.09 15966.89 6250.08
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64 13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36 25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91 32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58 17141.49
## 1 1605830400 2020-11-20 17821.58 18239.00 17764.76 18142.52 3909.44
## Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 6 99871183 1 448.58 40271351 62.37 8286862 0.2697
## 5 230076772 1 460.85 51758620 73.83 43992549 0.2880
## 4 436549314 1 482.68 93082972 76.41 45138596 0.3026
## 3 579119955 1 478.96 141725015 73.48 48863408 0.2937
## 2 306201498 1 471.92 62514644 81.64 63647825 0.3044
## 1 70437003 1 484.88 29558467 81.22 13261137 0.3005
## XRP.Volume
## 6 14292147
## 5 23107272
## 4 35535529
## 3 40662719
## 2 34206680
## 1 8675902
dim(master )
## [1] 1392 15
Data Description * Interest over time of Bitcoin (Jan 2017 –> Nov 2020) * Source: Google Trends * Numbers represent search interest relative to the highest point on the chart for the given region and time. A value of 100 is the peak popularity for the term. A value of 50 means that the term is half as popular. A score of 0 means there was not enough data for this term
Process Explanation * We were unable to find daily Bitcoin interest data for a long time frame. However, we were able to find weekly data * There are two options to align with the master data frame: + (1) Convert the master BTC data into weekly + (2) Convert the Google search weekly data into daily * Our group chose to convert the Google search data into daily data, as converting the BTC data into weekly data would significantly decrease our data size (~ 1300 –> ~ 300). * Instead, we iterate through the Google search data and assign each week's data point to the dates in that week * “google trends data_v3.csv” represents the weekly data converted to daily * The data cleaning to convert to daily was done in Excel (using VLOOKUP, MATCH and INDEX)
# Sourced from https://trends.google.com/trends/explore?date=2014-12-01%202020-11-20&geo=US&q=Bitcoin
# Load the Google Trends file
search <- read.csv(file = "google trends data_v3.csv", header = TRUE)
# Sort the rows by date
search <- search[order(search[["Date"]]), ]
# Confirm the row count
dim(search)
## [1] 1392 2
# Give the "Bitcoin" interest column a descriptive name
names(search)[names(search) == "Bitcoin"] <- "Google_Search_Frequency"
# Coerce the search frequency to numeric
search[["Google_Search_Frequency"]] <-
  as.numeric(search[["Google_Search_Frequency"]])
head(search)
## Date Google_Search_Frequency
## 1 2017-01-29 4
## 2 2017-01-30 4
## 3 2017-01-31 4
## 4 2017-02-01 4
## 5 2017-02-02 4
## 6 2017-02-03 4
tail(search)
## Date Google_Search_Frequency
## 1387 2020-11-15 22
## 1388 2020-11-16 22
## 1389 2020-11-17 22
## 1390 2020-11-18 22
## 1391 2020-11-19 22
## 1392 2020-11-20 22
Merge the search data to master data frame
# Append the daily search-frequency values to master as a new column.
# cbind() pairs rows by position (not by Date), so this relies on search and
# master being in the same row order.
# The new column is named after the argument expression, i.e. literally
# "search$Google_Search_Frequency" (visible in the printed header below).
master = cbind(master, search$Google_Search_Frequency)
head(master)
## Timestamp Date Open High Low Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29 924.70 927.47 915.00 917.31 2498.61 2303176
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36 3385239
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94 6298154
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96 5835317
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69 5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61 6815466
## HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392 1 10.50 189086.3 3.88 2151.59 0.00631 380.92
## 1391 1 10.59 437746.5 4.03 23569.63 0.00645 3249.53
## 1390 1 10.74 413350.2 4.07 35332.93 0.00641 13926.48
## 1389 1 10.73 630953.7 4.08 17621.75 0.00649 13118.79
## 1388 1 10.82 513774.8 4.09 17753.63 0.00640 13887.87
## 1387 1 10.95 531755.4 4.06 15202.71 0.00638 12139.60
## search$Google_Search_Frequency
## 1392 4
## 1391 4
## 1390 4
## 1389 4
## 1388 4
## 1387 4
Data Description: * Price of the S&P 500 from 2017-2020 * Source: Yahoo Finance * Since the stock market is only open on weekdays, the data file does not contain weekend values * We will remove weekends from the master file to ensure alignment of the asset classes' data * This will weaken the model, as it reduces the data frame size by ~ 104 data points per year (431 rows in total, 1,392 –> 961, once market holidays are also removed)
# Load the daily S&P 500 (^GSPC) history
SP500 <- read.csv(file = "^GSPC.csv", header = TRUE)
# Sort chronologically and parse the dates
SP500 <- SP500[order(SP500[["Date"]]), ]
SP500[["Date"]] <- as.Date(SP500[["Date"]], format = "%Y-%m-%d")
# Drop the Open, High, Low and Adj.Close columns
SP500 <- subset(SP500, select = -c(Open, High, Low, Adj.Close))
# Prefix the remaining value columns with the asset name
names(SP500)[names(SP500) == "Close"] <- "SP500.Price"
names(SP500)[names(SP500) == "Volume"] <- "SP500.Volume"
head(SP500)
## Date SP500.Price SP500.Volume
## 1 2017-01-30 2280.90 3591270000
## 2 2017-01-31 2278.87 4087450000
## 3 2017-02-01 2279.55 3916610000
## 4 2017-02-02 2280.85 3807710000
## 5 2017-02-03 2297.42 3597970000
## 6 2017-02-06 2292.56 3109050000
tail(SP500)
## Date SP500.Price SP500.Volume
## 956 2020-11-12 3537.01 4890120000
## 957 2020-11-13 3585.15 4709670000
## 958 2020-11-16 3626.91 5281980000
## 959 2020-11-17 3609.53 4799570000
## 960 2020-11-18 3567.79 5274450000
## 961 2020-11-19 3581.87 4347200000
dim(SP500)
## [1] 961 3
Convert master data frame to only include weekdays * Search through the master file, find the matching S&P 500 date, and insert the S&P 500 price and volume for that date * Comparing every pair of dates is an inefficient way to search; since the data is already sorted, this could be improved by using a single linear pass
# Placeholder columns; 0 stands for "no S&P 500 value for this date"
master[["SP500.Price"]] <- 0
master[["SP500.Volume"]] <- 0
head(master)
## Timestamp Date Open High Low Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29 924.70 927.47 915.00 917.31 2498.61 2303176
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36 3385239
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94 6298154
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96 5835317
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69 5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61 6815466
## HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392 1 10.50 189086.3 3.88 2151.59 0.00631 380.92
## 1391 1 10.59 437746.5 4.03 23569.63 0.00645 3249.53
## 1390 1 10.74 413350.2 4.07 35332.93 0.00641 13926.48
## 1389 1 10.73 630953.7 4.08 17621.75 0.00649 13118.79
## 1388 1 10.82 513774.8 4.09 17753.63 0.00640 13887.87
## 1387 1 10.95 531755.4 4.06 15202.71 0.00638 12139.60
## search$Google_Search_Frequency SP500.Price SP500.Volume
## 1392 4 0 0
## 1391 4 0 0
## 1390 4 0 0
## 1389 4 0 0
## 1388 4 0 0
## 1387 4 0 0
# Copy the S&P 500 price and volume onto the master rows with the same date.
# match() gives, for every master date, the position of the first equal date
# in SP500 (NA when there is none). This replaces a nested scan over every
# (master, SP500) date pair whose `next` did not leave the inner loop after a
# hit, so each master row was compared against all SP500 rows.
sp_idx <- match(master$Date, SP500$Date)
# Missing master dates never count as a match
sp_hit <- !is.na(sp_idx) & !is.na(master$Date)
# Rows without a match keep the value they already have
master$SP500.Price[sp_hit] <- SP500$SP500.Price[sp_idx[sp_hit]]
master$SP500.Volume[sp_hit] <- SP500$SP500.Volume[sp_idx[sp_hit]]
head(master)
## Timestamp Date Open High Low Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29 924.70 927.47 915.00 917.31 2498.61 2303176
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36 3385239
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94 6298154
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96 5835317
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69 5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61 6815466
## HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392 1 10.50 189086.3 3.88 2151.59 0.00631 380.92
## 1391 1 10.59 437746.5 4.03 23569.63 0.00645 3249.53
## 1390 1 10.74 413350.2 4.07 35332.93 0.00641 13926.48
## 1389 1 10.73 630953.7 4.08 17621.75 0.00649 13118.79
## 1388 1 10.82 513774.8 4.09 17753.63 0.00640 13887.87
## 1387 1 10.95 531755.4 4.06 15202.71 0.00638 12139.60
## search$Google_Search_Frequency SP500.Price SP500.Volume
## 1392 4 0.00 0
## 1391 4 2280.90 3591270000
## 1390 4 2278.87 4087450000
## 1389 4 2279.55 3916610000
## 1388 4 2280.85 3807710000
## 1387 4 2297.42 3597970000
tail(master)
## Timestamp Date Open High Low Close Volume.BTC
## 6 1605398400 2020-11-15 16082.01 16175.60 15796.09 15966.89 6250.08
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64 13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36 25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91 32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58 17141.49
## 1 1605830400 2020-11-20 17821.58 18239.00 17764.76 18142.52 3909.44
## Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 6 99871183 1 448.58 40271351 62.37 8286862 0.2697
## 5 230076772 1 460.85 51758620 73.83 43992549 0.2880
## 4 436549314 1 482.68 93082972 76.41 45138596 0.3026
## 3 579119955 1 478.96 141725015 73.48 48863408 0.2937
## 2 306201498 1 471.92 62514644 81.64 63647825 0.3044
## 1 70437003 1 484.88 29558467 81.22 13261137 0.3005
## XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 6 14292147 22 0.00 0
## 5 23107272 22 3626.91 5281980000
## 4 35535529 22 3609.53 4799570000
## 3 40662719 22 3567.79 5274450000
## 2 34206680 22 3581.87 4347200000
## 1 8675902 22 0.00 0
Removing Weekends/Holidays * Remove any dates for which the S&P 500 has no value * The master data frame now has only 961 rows of data
# Keep only the rows that received an S&P 500 value, i.e. drop the dates on
# which both placeholder columns are still 0
master <- master[master$SP500.Price != 0 | master$SP500.Volume != 0, ]
head(master)
## Timestamp Date Open High Low Close Volume.BTC
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61
## 1384 1486339200 2017-02-06 1019.31 1027.70 1014.64 1024.39 4227.33
## Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 1391 3385239 1 10.59 437746.5 4.03 23569.63 0.00645
## 1390 6298154 1 10.74 413350.2 4.07 35332.93 0.00641
## 1389 5835317 1 10.73 630953.7 4.08 17621.75 0.00649
## 1388 5602317 1 10.82 513774.8 4.09 17753.63 0.00640
## 1387 6815466 1 10.95 531755.4 4.06 15202.71 0.00638
## 1384 4321741 1 11.34 509447.5 3.97 6224.56 0.00638
## XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 1391 3249.53 4 2280.90 3591270000
## 1390 13926.48 4 2278.87 4087450000
## 1389 13118.79 4 2279.55 3916610000
## 1388 13887.87 4 2280.85 3807710000
## 1387 12139.60 4 2297.42 3597970000
## 1384 2697.23 4 2292.56 3109050000
tail(master)
## Timestamp Date Open High Low Close Volume.BTC
## 9 1605139200 2020-11-12 15705.79 16370.89 15446.82 16310.81 22153.74
## 8 1605225600 2020-11-13 16310.81 16491.92 15975.00 16339.56 14593.52
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64 13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36 25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91 32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58 17141.49
## Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 9 353634687 1 462.98 83026488 60.74 11126850 0.2551
## 8 237577678 0 477.14 63134324 66.01 30783340 0.2660
## 5 230076772 1 460.85 51758620 73.83 43992549 0.2880
## 4 436549314 1 482.68 93082972 76.41 45138596 0.3026
## 3 579119955 1 478.96 141725015 73.48 48863408 0.2937
## 2 306201498 1 471.92 62514644 81.64 63647825 0.3044
## XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 9 16698901 15 3537.01 4890120000
## 8 18763922 15 3585.15 4709670000
## 5 23107272 22 3626.91 5281980000
## 4 35535529 22 3609.53 4799570000
## 3 40662719 22 3567.79 5274450000
## 2 34206680 22 3581.87 4347200000
dim(master)
## [1] 961 18
Data Description * Price of gold from 2017- 2020 * Source: Yahoo Finance
# Load the daily gold futures (GC=F) history
gold <- read.csv("GC=F.csv", header = TRUE)
# Sort chronologically and parse the dates
gold <- gold[order(gold$Date), ]
gold$Date <- as.Date(gold$Date, format = "%Y-%m-%d")
# Remove the Open, High, Low and Adj.Close variables
gold <- subset(gold, select = -c(Open, High, Low, Adj.Close))
# Rename Close to Gold.Price and Volume to Gold.Volume
names(gold)[names(gold) == "Close"] <- "Gold.Price"
names(gold)[names(gold) == "Volume"] <- "Gold.Volume"
# Remove the rows in which both price and volume are the literal text "null"
gold <- gold[!(gold$Gold.Price == "null" & gold$Gold.Volume == "null"), ]
# Convert price and volume to numeric. Going through as.character() matters
# when the columns were read as factors: as.numeric() on a factor returns the
# internal level codes, not the printed values.
gold$Gold.Price <- as.numeric(as.character(gold$Gold.Price))
gold$Gold.Volume <- as.numeric(as.character(gold$Gold.Volume))
head(gold)
## Date Gold.Price Gold.Volume
## 1 2017-01-30 1193.2 50503
## 2 2017-01-31 1208.6 3212
## 3 2017-02-01 1205.6 1145
## 4 2017-02-02 1216.7 1512
## 5 2017-02-03 1218.5 865
## 7 2017-02-06 1230.0 908
tail(gold)
## Date Gold.Price Gold.Volume
## 1153 2020-11-12 1872.6 220
## 1154 2020-11-13 1885.7 240
## 1156 2020-11-16 1887.3 6
## 1157 2020-11-17 1884.5 59
## 1158 2020-11-18 1873.5 152
## 1159 2020-11-19 1861.1 59
dim(gold)
## [1] 1036 3
Aligning with Master Data Set
# Add the gold price and volume to master, matched by date.
# 0 is the placeholder for dates without a gold quote.
master$Gold.Price <- 0
master$Gold.Volume <- 0
# match() gives, for every master date, the position of the first equal date
# in gold (NA when there is none). This replaces a nested scan over every
# (master, gold) date pair whose `next` did not leave the inner loop after a
# hit.
gold_idx <- match(master$Date, gold$Date)
# Missing master dates never count as a match
gold_hit <- !is.na(gold_idx) & !is.na(master$Date)
master$Gold.Price[gold_hit] <- gold$Gold.Price[gold_idx[gold_hit]]
master$Gold.Volume[gold_hit] <- gold$Gold.Volume[gold_idx[gold_hit]]
head(master)
## Timestamp Date Open High Low Close Volume.BTC
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61
## 1384 1486339200 2017-02-06 1019.31 1027.70 1014.64 1024.39 4227.33
## Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 1391 3385239 1 10.59 437746.5 4.03 23569.63 0.00645
## 1390 6298154 1 10.74 413350.2 4.07 35332.93 0.00641
## 1389 5835317 1 10.73 630953.7 4.08 17621.75 0.00649
## 1388 5602317 1 10.82 513774.8 4.09 17753.63 0.00640
## 1387 6815466 1 10.95 531755.4 4.06 15202.71 0.00638
## 1384 4321741 1 11.34 509447.5 3.97 6224.56 0.00638
## XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 1391 3249.53 4 2280.90 3591270000
## 1390 13926.48 4 2278.87 4087450000
## 1389 13118.79 4 2279.55 3916610000
## 1388 13887.87 4 2280.85 3807710000
## 1387 12139.60 4 2297.42 3597970000
## 1384 2697.23 4 2292.56 3109050000
## Gold.Price Gold.Volume
## 1391 1193.2 50503
## 1390 1208.6 3212
## 1389 1205.6 1145
## 1388 1216.7 1512
## 1387 1218.5 865
## 1384 1230.0 908
tail(master)
## Timestamp Date Open High Low Close Volume.BTC
## 9 1605139200 2020-11-12 15705.79 16370.89 15446.82 16310.81 22153.74
## 8 1605225600 2020-11-13 16310.81 16491.92 15975.00 16339.56 14593.52
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64 13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36 25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91 32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58 17141.49
## Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 9 353634687 1 462.98 83026488 60.74 11126850 0.2551
## 8 237577678 0 477.14 63134324 66.01 30783340 0.2660
## 5 230076772 1 460.85 51758620 73.83 43992549 0.2880
## 4 436549314 1 482.68 93082972 76.41 45138596 0.3026
## 3 579119955 1 478.96 141725015 73.48 48863408 0.2937
## 2 306201498 1 471.92 62514644 81.64 63647825 0.3044
## XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume Gold.Price
## 9 16698901 15 3537.01 4890120000 1872.6
## 8 18763922 15 3585.15 4709670000 1885.7
## 5 23107272 22 3626.91 5281980000 1887.3
## 4 35535529 22 3609.53 4799570000 1884.5
## 3 40662719 22 3567.79 5274450000 1873.5
## 2 34206680 22 3581.87 4347200000 1861.1
## Gold.Volume
## 9 220
## 8 240
## 5 6
## 4 59
## 3 152
## 2 59
The data now has 952 data points to work with and 20 columns * 952 data points are very few considering the complexity of this task * 20 columns provide a sufficient number of options for creating predictive outcomes * It’ll be difficult to build a strong predictive model given the small data set, randomness of the data, and relatively simple machine learning models.
# Removing blanks: keep a row unless Gold.Price and Gold.Volume are both zero
master <- master[master$Gold.Price != 0 | master$Gold.Volume != 0, ]
# Rows and columns left after the filter
dim(master)
## [1] 952 20
# Per-column summary statistics of master.
summary(master)
## Timestamp Date Open High
## Min. :1.486e+09 Min. :2017-01-30 Min. : 917.3 Min. : 924
## 1st Qu.:1.516e+09 1st Qu.:2018-01-10 1st Qu.: 4208.0 1st Qu.: 4361
## Median :1.546e+09 Median :2018-12-27 Median : 7357.7 Median : 7561
## Mean :1.546e+09 Mean :2018-12-26 Mean : 7222.7 Mean : 7443
## 3rd Qu.:1.576e+09 3rd Qu.:2019-12-10 3rd Qu.: 9527.5 3rd Qu.: 9733
## Max. :1.606e+09 Max. :2020-11-19 Max. :19379.0 Max. :19651
## Low Close Volume.BTC Volume.USD
## Min. : 914.7 Min. : 923.5 Min. : 0 Min. :0.000e+00
## 1st Qu.: 4047.4 1st Qu.: 4213.8 1st Qu.: 8114 1st Qu.:4.264e+07
## Median : 7127.2 Median : 7350.8 Median : 12052 Median :8.201e+07
## Mean : 6974.9 Mean : 7239.9 Mean : 15036 Mean :1.167e+08
## 3rd Qu.: 9229.2 3rd Qu.: 9523.4 3rd Qu.: 17937 3rd Qu.:1.427e+08
## Max. :18200.0 Max. :19039.0 Max. :117495 Max. :1.238e+09
## HL.Close ETH.Price ETH.Volume LTC.Price
## Min. :0.0000 Min. : 10.59 Min. : 0 Min. : 3.76
## 1st Qu.:0.0000 1st Qu.: 164.73 1st Qu.: 15376424 1st Qu.: 43.58
## Median :1.0000 Median : 227.47 Median : 31965666 Median : 55.84
## Mean :0.5483 Mean : 292.97 Mean : 50464237 Mean : 70.43
## 3rd Qu.:1.0000 3rd Qu.: 368.11 3rd Qu.: 58655071 3rd Qu.: 79.82
## Max. :1.0000 Max. :1290.01 Max. :736027536 Max. :359.40
## LTC.Volume XRP.Price XRP.Volume
## Min. :0.000e+00 Min. :0.00539 Min. : 2697
## 1st Qu.:5.991e+06 1st Qu.:0.20945 1st Qu.: 4118799
## Median :1.195e+07 Median :0.27870 Median : 8967614
## Mean :2.805e+07 Mean :0.35137 Mean : 15765896
## 3rd Qu.:2.461e+07 3rd Qu.:0.38880 3rd Qu.: 16934925
## Max. :1.082e+09 Max. :2.73000 Max. :335937893
## search$Google_Search_Frequency SP500.Price SP500.Volume
## Min. : 4.00 Min. :2237 Min. :1.969e+09
## 1st Qu.: 9.00 1st Qu.:2596 1st Qu.:3.271e+09
## Median : 11.00 Median :2795 Median :3.598e+09
## Mean : 14.51 Mean :2819 Mean :3.880e+09
## 3rd Qu.: 14.25 3rd Qu.:2992 3rd Qu.:4.083e+09
## Max. :100.00 Max. :3627 Max. :9.045e+09
## Gold.Price Gold.Volume
## Min. :1176 Min. : 0
## 1st Qu.:1265 1st Qu.: 28
## Median :1313 Median : 135
## Mean :1416 Mean : 6087
## 3rd Qu.:1512 3rd Qu.: 474
## Max. :2052 Max. :386334
The min and max of the price variables, such as Ethereum’s min of 10.59 and max of 1290, show the rapid growth and volatility of the cryptocurrency market. This is especially clear when cryptocurrency is compared to traditional asset classes over the same period, such as the S&P500, which had a min of 2237 and max of 3627, and Gold, which had a min of 1176 and max of 2052.
# Histograms of the two Bitcoin volume columns and of the Close price
hist(x = master$Volume.BTC, col = "blue")
hist(x = master$Volume.USD, col = "blue")
hist(x = master$Close, col = "blue")
## Display Scatter Plots * Display the Close price over time; this confirms how volatile Bitcoin prices are
# Scatter plot of Close against Date, then the first rows of master
plot(x = master$Date, y = master$Close, pch = 20, col = "red")
head(x = master)
## Timestamp Date Open High Low Close Volume.BTC
## 1391 1485734400 2017-01-30 917.31 923.95 914.69 923.45 3678.36
## 1390 1485820800 2017-01-31 923.45 971.24 922.83 970.92 6624.94
## 1389 1485907200 2017-02-01 970.92 991.38 963.84 989.71 5983.96
## 1388 1485993600 2017-02-02 989.71 1010.00 978.74 1007.66 5623.69
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77 6731.61
## 1384 1486339200 2017-02-06 1019.31 1027.70 1014.64 1024.39 4227.33
## Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 1391 3385239 1 10.59 437746.5 4.03 23569.63 0.00645
## 1390 6298154 1 10.74 413350.2 4.07 35332.93 0.00641
## 1389 5835317 1 10.73 630953.7 4.08 17621.75 0.00649
## 1388 5602317 1 10.82 513774.8 4.09 17753.63 0.00640
## 1387 6815466 1 10.95 531755.4 4.06 15202.71 0.00638
## 1384 4321741 1 11.34 509447.5 3.97 6224.56 0.00638
## XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 1391 3249.53 4 2280.90 3591270000
## 1390 13926.48 4 2278.87 4087450000
## 1389 13118.79 4 2279.55 3916610000
## 1388 13887.87 4 2280.85 3807710000
## 1387 12139.60 4 2297.42 3597970000
## 1384 2697.23 4 2292.56 3109050000
## Gold.Price Gold.Volume
## 1391 1193.2 50503
## 1390 1208.6 3212
## 1389 1205.6 1145
## 1388 1216.7 1512
## 1387 1218.5 865
## 1384 1230.0 908
# Store the gold columns as doubles before plotting.
master$Gold.Price <- as.double(master$Gold.Price)
master$Gold.Volume <- as.double(master$Gold.Volume)
# Scatter plots of Close (y axis) against each of the other market variables.
# Each variable is plotted exactly once.
plot(master$Volume.USD, master$Close, pch = 20, col = "red")
plot(master$Volume.BTC, master$Close, pch = 20, col = "red")
plot(master$ETH.Price, master$Close, pch = 20, col = "red")
plot(master$ETH.Volume, master$Close, pch = 20, col = "red")
plot(master$LTC.Price, master$Close, pch = 20, col = "red")
plot(master$LTC.Volume, master$Close, pch = 20, col = "red")
plot(master$XRP.Price, master$Close, pch = 20, col = "red")
plot(master$XRP.Volume, master$Close, pch = 20, col = "red")
plot(master$SP500.Price, master$Close, pch = 20, col = "red")
plot(master$SP500.Volume, master$Close, pch = 20, col = "red")
plot(master$Gold.Price, master$Close, pch = 20, col = "red")
plot(master$Gold.Volume, master$Close, pch = 20, col = "red")
#plot( master$HL.Close,master$Close,pch=20,col="red")
# Move the y variable (HL.Close) to the last column index.
# Do not rerun this chunk, or the column indexing will get messed up.
#master <- master[,c(1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,9)]
# List the current column names of master.
names(master)
## [1] "Timestamp" "Date"
## [3] "Open" "High"
## [5] "Low" "Close"
## [7] "Volume.BTC" "Volume.USD"
## [9] "HL.Close" "ETH.Price"
## [11] "ETH.Volume" "LTC.Price"
## [13] "LTC.Volume" "XRP.Price"
## [15] "XRP.Volume" "search$Google_Search_Frequency"
## [17] "SP500.Price" "SP500.Volume"
## [19] "Gold.Price" "Gold.Volume"
# To convert all values to integers (kept for reference, not run):
#cor(as.numeric(RETS), as.numeric(RETS) -> correl
# A correlation matrix cannot contain categorical variables; every column must
# be numeric, so print the class of each column of master.
sapply(master,class)
## Timestamp Date
## "integer" "Date"
## Open High
## "numeric" "numeric"
## Low Close
## "numeric" "numeric"
## Volume.BTC Volume.USD
## "numeric" "numeric"
## HL.Close ETH.Price
## "numeric" "numeric"
## ETH.Volume LTC.Price
## "numeric" "numeric"
## LTC.Volume XRP.Price
## "numeric" "numeric"
## XRP.Volume search$Google_Search_Frequency
## "numeric" "numeric"
## SP500.Price SP500.Volume
## "numeric" "numeric"
## Gold.Price Gold.Volume
## "numeric" "numeric"
# Give the Google search-frequency column a plain name
colnames(master)[colnames(master) == "search$Google_Search_Frequency"] <- "Google.Search"
# head(master)
# Drop Timestamp and Date; the remaining columns feed the correlation analysis
master_cor <- master[, setdiff(names(master), c("Timestamp", "Date"))]
master <- master_cor
head(master_cor)
## Open High Low Close Volume.BTC Volume.USD HL.Close ETH.Price
## 1391 917.31 923.95 914.69 923.45 3678.36 3385239 1 10.59
## 1390 923.45 971.24 922.83 970.92 6624.94 6298154 1 10.74
## 1389 970.92 991.38 963.84 989.71 5983.96 5835317 1 10.73
## 1388 989.71 1010.00 978.74 1007.66 5623.69 5602317 1 10.82
## 1387 1007.66 1024.50 994.34 1016.77 6731.61 6815466 1 10.95
## 1384 1019.31 1027.70 1014.64 1024.39 4227.33 4321741 1 11.34
## ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391 437746.5 4.03 23569.63 0.00645 3249.53 4
## 1390 413350.2 4.07 35332.93 0.00641 13926.48 4
## 1389 630953.7 4.08 17621.75 0.00649 13118.79 4
## 1388 513774.8 4.09 17753.63 0.00640 13887.87 4
## 1387 531755.4 4.06 15202.71 0.00638 12139.60 4
## 1384 509447.5 3.97 6224.56 0.00638 2697.23 4
## SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391 2280.90 3591270000 1193.2 50503
## 1390 2278.87 4087450000 1208.6 3212
## 1389 2279.55 3916610000 1205.6 1145
## 1388 2280.85 3807710000 1216.7 1512
## 1387 2297.42 3597970000 1218.5 865
## 1384 2292.56 3109050000 1230.0 908
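Correlation matrix of the first 17 columns of master_cor. Note that HL.Close, our Y variable, is still among these columns; it is removed in the “Correlation Matrix without HL.Close” section below.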
# Pairwise correlations of columns 1 to 17 of master_cor, rounded to 2 decimals.
# NOTE(review): master_cor has 18 columns at this point, so this range still
# contains HL.Close (column 7) and leaves out Gold.Volume (column 18) - confirm
# that is intended.
x <- round(cor(master_cor[, 1:17]), digits = 2)
library(corrplot)
## corrplot 0.84 loaded
# Print the unrounded correlation matrix of columns 1 to 17 of master_cor.
cor(master_cor[ ,1:17])
## Open High Low Close Volume.BTC
## Open 1.00000000 0.99614666 0.99385953 0.99398782 0.21378860
## High 0.99614666 1.00000000 0.99160853 0.99728517 0.25297738
## Low 0.99385953 0.99160853 1.00000000 0.99597589 0.15193856
## Close 0.99398782 0.99728517 0.99597589 1.00000000 0.21098044
## Volume.BTC 0.21378860 0.25297738 0.15193856 0.21098044 1.00000000
## Volume.USD 0.62722020 0.66322474 0.57190725 0.62752404 0.81984498
## HL.Close -0.04916218 -0.04944641 -0.05103671 -0.05279288 0.02424055
## ETH.Price 0.57120233 0.57731423 0.55002907 0.56566335 0.21407297
## ETH.Volume 0.39517796 0.40934477 0.33473403 0.37747912 0.58759126
## LTC.Price 0.61142077 0.62015225 0.58566120 0.60656188 0.23692178
## LTC.Volume 0.38258455 0.39652530 0.33444569 0.37431167 0.42595723
## XRP.Price 0.43021218 0.43589057 0.40887769 0.42424650 0.12991861
## XRP.Volume 0.45826224 0.46490252 0.42030343 0.44555048 0.34327036
## Google.Search 0.48658082 0.51339231 0.44401155 0.48907791 0.50178696
## SP500.Price 0.69859731 0.68155501 0.72429409 0.69906597 -0.10012093
## SP500.Volume 0.15383423 0.15449643 0.15617054 0.15441583 0.23529218
## Gold.Price 0.57359361 0.56075600 0.59683323 0.57662753 0.02599601
## Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price
## Open 0.627220202 -0.0491621820 0.57120233 0.395177961 0.61142077
## High 0.663224739 -0.0494464119 0.57731423 0.409344767 0.62015225
## Low 0.571907249 -0.0510367083 0.55002907 0.334734029 0.58566120
## Close 0.627524040 -0.0527928772 0.56566335 0.377479124 0.60656188
## Volume.BTC 0.819844983 0.0242405549 0.21407297 0.587591264 0.23692178
## Volume.USD 1.000000000 -0.0046017805 0.42877175 0.647834573 0.50569552
## HL.Close -0.004601781 1.0000000000 -0.02680736 0.007663671 -0.03813247
## ETH.Price 0.428771749 -0.0268073573 1.00000000 0.666252503 0.83472892
## ETH.Volume 0.647834573 0.0076636710 0.66625250 1.000000000 0.58993239
## LTC.Price 0.505695524 -0.0381324677 0.83472892 0.589932385 1.00000000
## LTC.Volume 0.603150494 -0.0049336060 0.47356322 0.669756933 0.68390439
## XRP.Price 0.301901750 -0.0112122870 0.81737650 0.574323310 0.80081994
## XRP.Volume 0.487882284 0.0435668579 0.51665623 0.695489155 0.52908491
## Google.Search 0.699667142 0.0148804734 0.57148617 0.659257829 0.67084997
## SP500.Price 0.200549244 -0.0368106151 0.09592880 -0.022683036 0.05719449
## SP500.Volume 0.168247659 0.0307668933 -0.10137879 0.027693148 -0.16750611
## Gold.Price 0.217613931 -0.0003576898 0.01365688 -0.004235498 -0.12856161
## LTC.Volume XRP.Price XRP.Volume Google.Search SP500.Price
## Open 0.382584546 0.430212180 0.45826224 0.48658082 0.698597308
## High 0.396525305 0.435890572 0.46490252 0.51339231 0.681555010
## Low 0.334445688 0.408877693 0.42030343 0.44401155 0.724294088
## Close 0.374311669 0.424246499 0.44555048 0.48907791 0.699065970
## Volume.BTC 0.425957234 0.129918612 0.34327036 0.50178696 -0.100120933
## Volume.USD 0.603150494 0.301901750 0.48788228 0.69966714 0.200549244
## HL.Close -0.004933606 -0.011212287 0.04356686 0.01488047 -0.036810615
## ETH.Price 0.473563215 0.817376505 0.51665623 0.57148617 0.095928802
## ETH.Volume 0.669756933 0.574323310 0.69548915 0.65925783 -0.022683036
## LTC.Price 0.683904393 0.800819943 0.52908491 0.67084997 0.057194488
## LTC.Volume 1.000000000 0.377375611 0.44231315 0.74015179 -0.090281630
## XRP.Price 0.377375611 1.000000000 0.69792442 0.45554421 0.002754101
## XRP.Volume 0.442313148 0.697924425 1.00000000 0.52513982 0.103723173
## Google.Search 0.740151787 0.455544206 0.52513982 1.00000000 -0.091750110
## SP500.Price -0.090281630 0.002754101 0.10372317 -0.09175011 1.000000000
## SP500.Volume -0.074001768 -0.141778731 0.13827935 -0.02174655 0.154385820
## Gold.Price -0.132099033 -0.183135446 0.11793121 -0.08913261 0.764878470
## SP500.Volume Gold.Price
## Open 0.15383423 0.5735936055
## High 0.15449643 0.5607560029
## Low 0.15617054 0.5968332252
## Close 0.15441583 0.5766275250
## Volume.BTC 0.23529218 0.0259960126
## Volume.USD 0.16824766 0.2176139306
## HL.Close 0.03076689 -0.0003576898
## ETH.Price -0.10137879 0.0136568846
## ETH.Volume 0.02769315 -0.0042354975
## LTC.Price -0.16750611 -0.1285616073
## LTC.Volume -0.07400177 -0.1320990330
## XRP.Price -0.14177873 -0.1831354460
## XRP.Volume 0.13827935 0.1179312139
## Google.Search -0.02174655 -0.0891326080
## SP500.Price 0.15438582 0.7648784698
## SP500.Volume 1.00000000 0.4536416658
## Gold.Price 0.45364167 1.0000000000
library("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library("RColorBrewer")
# Correlation plot of columns 1 to 17, with the coefficients drawn as numbers
corrplot(corr = cor(master_cor[, 1:17]), method = "number")
# Correlation matrix of every column of master_cor
M <- cor(master_cor)
# Upper triangle only, hclust ordering, 8-colour RdYlBu palette
corrplot(
  corr = M, type = "upper", order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu")
)
## Correlation Matrix without HL.Close
# Copy of master_cor with the HL.Close column dropped
master_cor2 <- master_cor[, names(master_cor) != "HL.Close"]
# Rounded correlations of columns 1 to 17 of that copy
x <- round(cor(master_cor2[, 1:17]), digits = 2)
library(corrplot)
# Print the unrounded matrix
cor(master_cor2[, 1:17])
## Open High Low Close Volume.BTC
## Open 1.00000000 0.99614666 0.99385953 0.99398782 0.213788601
## High 0.99614666 1.00000000 0.99160853 0.99728517 0.252977377
## Low 0.99385953 0.99160853 1.00000000 0.99597589 0.151938564
## Close 0.99398782 0.99728517 0.99597589 1.00000000 0.210980441
## Volume.BTC 0.21378860 0.25297738 0.15193856 0.21098044 1.000000000
## Volume.USD 0.62722020 0.66322474 0.57190725 0.62752404 0.819844983
## ETH.Price 0.57120233 0.57731423 0.55002907 0.56566335 0.214072969
## ETH.Volume 0.39517796 0.40934477 0.33473403 0.37747912 0.587591264
## LTC.Price 0.61142077 0.62015225 0.58566120 0.60656188 0.236921785
## LTC.Volume 0.38258455 0.39652530 0.33444569 0.37431167 0.425957234
## XRP.Price 0.43021218 0.43589057 0.40887769 0.42424650 0.129918612
## XRP.Volume 0.45826224 0.46490252 0.42030343 0.44555048 0.343270365
## Google.Search 0.48658082 0.51339231 0.44401155 0.48907791 0.501786959
## SP500.Price 0.69859731 0.68155501 0.72429409 0.69906597 -0.100120933
## SP500.Volume 0.15383423 0.15449643 0.15617054 0.15441583 0.235292179
## Gold.Price 0.57359361 0.56075600 0.59683323 0.57662753 0.025996013
## Gold.Volume -0.02323081 -0.02153901 -0.02183498 -0.02167915 -0.003082816
## Volume.USD ETH.Price ETH.Volume LTC.Price LTC.Volume
## Open 0.62722020 0.57120233 0.395177961 0.611420771 0.38258455
## High 0.66322474 0.57731423 0.409344767 0.620152247 0.39652530
## Low 0.57190725 0.55002907 0.334734029 0.585661205 0.33444569
## Close 0.62752404 0.56566335 0.377479124 0.606561882 0.37431167
## Volume.BTC 0.81984498 0.21407297 0.587591264 0.236921785 0.42595723
## Volume.USD 1.00000000 0.42877175 0.647834573 0.505695524 0.60315049
## ETH.Price 0.42877175 1.00000000 0.666252503 0.834728924 0.47356322
## ETH.Volume 0.64783457 0.66625250 1.000000000 0.589932385 0.66975693
## LTC.Price 0.50569552 0.83472892 0.589932385 1.000000000 0.68390439
## LTC.Volume 0.60315049 0.47356322 0.669756933 0.683904393 1.00000000
## XRP.Price 0.30190175 0.81737650 0.574323310 0.800819943 0.37737561
## XRP.Volume 0.48788228 0.51665623 0.695489155 0.529084912 0.44231315
## Google.Search 0.69966714 0.57148617 0.659257829 0.670849973 0.74015179
## SP500.Price 0.20054924 0.09592880 -0.022683036 0.057194488 -0.09028163
## SP500.Volume 0.16824766 -0.10137879 0.027693148 -0.167506108 -0.07400177
## Gold.Price 0.21761393 0.01365688 -0.004235498 -0.128561607 -0.13209903
## Gold.Volume -0.02005472 0.01169730 -0.012074959 0.004945084 -0.02073302
## XRP.Price XRP.Volume Google.Search SP500.Price SP500.Volume
## Open 0.430212180 0.45826224 0.48658082 0.698597308 0.15383423
## High 0.435890572 0.46490252 0.51339231 0.681555010 0.15449643
## Low 0.408877693 0.42030343 0.44401155 0.724294088 0.15617054
## Close 0.424246499 0.44555048 0.48907791 0.699065970 0.15441583
## Volume.BTC 0.129918612 0.34327036 0.50178696 -0.100120933 0.23529218
## Volume.USD 0.301901750 0.48788228 0.69966714 0.200549244 0.16824766
## ETH.Price 0.817376505 0.51665623 0.57148617 0.095928802 -0.10137879
## ETH.Volume 0.574323310 0.69548915 0.65925783 -0.022683036 0.02769315
## LTC.Price 0.800819943 0.52908491 0.67084997 0.057194488 -0.16750611
## LTC.Volume 0.377375611 0.44231315 0.74015179 -0.090281630 -0.07400177
## XRP.Price 1.000000000 0.69792442 0.45554421 0.002754101 -0.14177873
## XRP.Volume 0.697924425 1.00000000 0.52513982 0.103723173 0.13827935
## Google.Search 0.455544206 0.52513982 1.00000000 -0.091750110 -0.02174655
## SP500.Price 0.002754101 0.10372317 -0.09175011 1.000000000 0.15438582
## SP500.Volume -0.141778731 0.13827935 -0.02174655 0.154385820 1.00000000
## Gold.Price -0.183135446 0.11793121 -0.08913261 0.764878470 0.45364167
## Gold.Volume 0.009514297 -0.01548651 0.02136058 -0.019322232 -0.01759803
## Gold.Price Gold.Volume
## Open 0.573593605 -0.023230807
## High 0.560756003 -0.021539010
## Low 0.596833225 -0.021834979
## Close 0.576627525 -0.021679153
## Volume.BTC 0.025996013 -0.003082816
## Volume.USD 0.217613931 -0.020054724
## ETH.Price 0.013656885 0.011697303
## ETH.Volume -0.004235498 -0.012074959
## LTC.Price -0.128561607 0.004945084
## LTC.Volume -0.132099033 -0.020733019
## XRP.Price -0.183135446 0.009514297
## XRP.Volume 0.117931214 -0.015486512
## Google.Search -0.089132608 0.021360583
## SP500.Price 0.764878470 -0.019322232
## SP500.Volume 0.453641666 -0.017598033
## Gold.Price 1.000000000 -0.031549672
## Gold.Volume -0.031549672 1.000000000
library(Hmisc)
library("RColorBrewer")
# Correlation plot of master_cor2 with the coefficients drawn as numbers
corrplot(corr = cor(master_cor2[, 1:17]), method = "number")
# Correlation matrix of every column of master_cor2
M <- cor(master_cor2)
# Upper triangle only, hclust ordering, 8-colour RdYlBu palette
corrplot(
  corr = M, type = "upper", order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu")
)
## Plot Part 2 Y value = HL.Close
# Scatter plots of the 0/1 response HL.Close (taken from master_cor) against
# each predictor in master_cor2. Each variable is plotted exactly once.
plot(master_cor2$Volume.USD, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$Volume.BTC, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$ETH.Price, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$ETH.Volume, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$LTC.Price, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$LTC.Volume, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$XRP.Price, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$XRP.Volume, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$SP500.Price, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$SP500.Volume, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$Gold.Price, master_cor$HL.Close, pch = 20, col = "red")
plot(master_cor2$Gold.Volume, master_cor$HL.Close, pch = 20, col = "red")
### Remove highly correlated variables Typically you would want to remove variables that are highly correlated (0.4+) to avoid multicollinearity. However, as shown in the correlation charts, the variables are very highly correlated with each other *To ensure that we have enough data points in our model, we will use 0.76 as the correlation cutoff point
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
# Correlation matrix of every column in master_cor (HL.Close included).
df2 <- cor(master_cor)
# Column indices that caret::findCorrelation flags for removal at the 0.76
# cutoff (any other value can be used as the cutoff).
hc <- sort(findCorrelation(df2, cutoff = 0.76))
# Negative indexing with an empty vector selects zero columns, so only drop
# columns when something was flagged; drop = FALSE keeps a data frame even if
# a single column remains.
master_reduced <- if (length(hc) > 0) {
  master_cor[, -hc, drop = FALSE]
} else {
  master_cor
}
head(master_reduced)
## Low Volume.BTC HL.Close ETH.Volume LTC.Volume XRP.Price XRP.Volume
## 1391 914.69 3678.36 1 437746.5 23569.63 0.00645 3249.53
## 1390 922.83 6624.94 1 413350.2 35332.93 0.00641 13926.48
## 1389 963.84 5983.96 1 630953.7 17621.75 0.00649 13118.79
## 1388 978.74 5623.69 1 513774.8 17753.63 0.00640 13887.87
## 1387 994.34 6731.61 1 531755.4 15202.71 0.00638 12139.60
## 1384 1014.64 4227.33 1 509447.5 6224.56 0.00638 2697.23
## Google.Search SP500.Price SP500.Volume Gold.Volume
## 1391 4 2280.90 3591270000 50503
## 1390 4 2278.87 4087450000 3212
## 1389 4 2279.55 3916610000 1145
## 1388 4 2280.85 3807710000 1512
## 1387 4 2297.42 3597970000 865
## 1384 4 2292.56 3109050000 908
*Regression with the eliminated correlation variables
# Work on a copy of the reduced data for the regression
master_reg <- master_reduced
# Create Training and Testing Sets
head(x = master_reg)
## Low Volume.BTC HL.Close ETH.Volume LTC.Volume XRP.Price XRP.Volume
## 1391 914.69 3678.36 1 437746.5 23569.63 0.00645 3249.53
## 1390 922.83 6624.94 1 413350.2 35332.93 0.00641 13926.48
## 1389 963.84 5983.96 1 630953.7 17621.75 0.00649 13118.79
## 1388 978.74 5623.69 1 513774.8 17753.63 0.00640 13887.87
## 1387 994.34 6731.61 1 531755.4 15202.71 0.00638 12139.60
## 1384 1014.64 4227.33 1 509447.5 6224.56 0.00638 2697.23
## Google.Search SP500.Price SP500.Volume Gold.Volume
## 1391 4 2280.90 3591270000 50503
## 1390 4 2278.87 4087450000 3212
## 1389 4 2279.55 3916610000 1145
## 1388 4 2280.85 3807710000 1512
## 1387 4 2297.42 3597970000 865
## 1384 4 2292.56 3109050000 908
# Split master_reg at random into an 80% training set and a 20% testing set.
# NOTE(review): no set.seed() call is visible before this draw, so the split
# (and every figure derived from it) changes between runs - confirm.
num_samples <- nrow(master_reg)
sampling.rate <- 0.8
# sample() truncates a fractional size; floor() makes that explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
# Index the same data frame the row count came from (master_reg, not master),
# so both sets hold only the columns kept after the correlation filter.
trainingSet <- master_reg[training, ]
# Every row not drawn for training goes to the testing set.
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_reg[testing, ]
Create regression model, Logistic Regression with the removed correlated variables
# Create Regression Model: logistic regression of HL.Close on the ten
# predictors left in master_reduced
LogisticReg <- glm(
  formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + LTC.Volume +
    XRP.Price + XRP.Volume + Google.Search + SP500.Price + SP500.Volume +
    Gold.Volume,
  family = binomial(logit),
  data = trainingSet
)
# get summary statistics
summary(LogisticReg)
##
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + LTC.Volume +
## XRP.Price + XRP.Volume + Google.Search + SP500.Price + SP500.Volume +
## Gold.Volume, family = binomial(logit), data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0236 -1.2504 0.9491 1.0757 1.4711
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.164e-02 1.398e+00 0.037 0.971
## Low -8.809e-05 5.603e-05 -1.572 0.116
## Volume.BTC -1.502e-06 8.767e-06 -0.171 0.864
## ETH.Volume -2.818e-09 2.113e-09 -1.333 0.182
## LTC.Volume -1.020e-09 1.879e-09 -0.543 0.587
## XRP.Price -4.369e-01 4.502e-01 -0.970 0.332
## XRP.Volume 1.235e-08 5.309e-09 2.327 0.020 *
## Google.Search 1.704e-02 1.172e-02 1.454 0.146
## SP500.Price 2.920e-04 5.331e-04 0.548 0.584
## SP500.Volume -3.462e-11 8.527e-11 -0.406 0.685
## Gold.Volume -7.549e-09 2.320e-06 -0.003 0.997
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1045.9 on 760 degrees of freedom
## Residual deviance: 1032.6 on 750 degrees of freedom
## AIC: 1054.6
##
## Number of Fisher Scoring iterations: 4
Removal of SP500.Volume Eliminate insignificant variables one by one, starting with the variable with the highest P value
# Refit without SP500.Volume
LogisticReg <- glm(
  formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + LTC.Volume +
    XRP.Price + XRP.Volume + Google.Search + SP500.Price + Gold.Volume,
  family = binomial(logit),
  data = trainingSet
)
# get summary statistics
summary(LogisticReg)
##
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + LTC.Volume +
## XRP.Price + XRP.Volume + Google.Search + SP500.Price + Gold.Volume,
## family = binomial(logit), data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9968 -1.2469 0.9526 1.0795 1.4595
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.604e-01 1.296e+00 -0.124 0.9015
## Low -9.385e-05 5.420e-05 -1.731 0.0834 .
## Volume.BTC -2.423e-06 8.475e-06 -0.286 0.7750
## ETH.Volume -2.757e-09 2.102e-09 -1.312 0.1896
## LTC.Volume -9.179e-10 1.850e-09 -0.496 0.6197
## XRP.Price -3.772e-01 4.251e-01 -0.887 0.3749
## XRP.Volume 1.166e-08 4.988e-09 2.339 0.0194 *
## Google.Search 1.766e-02 1.161e-02 1.521 0.1282
## SP500.Price 3.300e-04 5.247e-04 0.629 0.5294
## Gold.Volume 5.213e-09 2.318e-06 0.002 0.9982
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1045.9 on 760 degrees of freedom
## Residual deviance: 1032.8 on 751 degrees of freedom
## AIC: 1052.8
##
## Number of Fisher Scoring iterations: 4
Removal of LTC.Volume
# Refit without LTC.Volume
LogisticReg <- glm(
  formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Price +
    XRP.Volume + Google.Search + SP500.Price + Gold.Volume,
  family = binomial(logit),
  data = trainingSet
)
# get summary statistics
summary(LogisticReg)
##
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Price +
## XRP.Volume + Google.Search + SP500.Price + Gold.Volume, family = binomial(logit),
## data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9609 -1.2472 0.9498 1.0798 1.3882
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.271e-01 1.290e+00 -0.176 0.8603
## Low -9.755e-05 5.376e-05 -1.814 0.0696 .
## Volume.BTC -1.949e-06 8.405e-06 -0.232 0.8167
## ETH.Volume -3.151e-09 1.946e-09 -1.619 0.1054
## XRP.Price -3.532e-01 4.210e-01 -0.839 0.4016
## XRP.Volume 1.190e-08 4.965e-09 2.397 0.0165 *
## Google.Search 1.512e-02 1.032e-02 1.465 0.1429
## SP500.Price 3.668e-04 5.201e-04 0.705 0.4807
## Gold.Volume 4.404e-08 2.317e-06 0.019 0.9848
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1045.9 on 760 degrees of freedom
## Residual deviance: 1033.0 on 752 degrees of freedom
## AIC: 1051
##
## Number of Fisher Scoring iterations: 4
Removal of XRP.Price
# Refit without XRP.Price
LogisticReg <- glm(
  formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Volume +
    Google.Search + SP500.Price + Gold.Volume,
  family = binomial(logit),
  data = trainingSet
)
# get summary statistics
summary(LogisticReg)
##
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Volume +
## Google.Search + SP500.Price + Gold.Volume, family = binomial(logit),
## data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9167 -1.2469 0.9708 1.0800 1.4799
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.037e-01 1.158e+00 -0.608 0.5433
## Low -1.154e-04 4.940e-05 -2.337 0.0194 *
## Volume.BTC 3.866e-07 7.951e-06 0.049 0.9612
## ETH.Volume -3.562e-09 1.889e-09 -1.886 0.0593 .
## XRP.Volume 1.004e-08 4.457e-09 2.252 0.0243 *
## Google.Search 1.639e-02 1.019e-02 1.608 0.1078
## SP500.Price 5.356e-04 4.796e-04 1.117 0.2641
## Gold.Volume -7.138e-08 2.313e-06 -0.031 0.9754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1045.9 on 760 degrees of freedom
## Residual deviance: 1033.8 on 753 degrees of freedom
## AIC: 1049.8
##
## Number of Fisher Scoring iterations: 4
Removal of Gold.Volume
# Refit without Gold.Volume
LogisticReg <- glm(
  formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Volume +
    Google.Search + SP500.Price,
  family = binomial(logit),
  data = trainingSet
)
# get summary statistics
summary(LogisticReg)
##
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Volume +
## Google.Search + SP500.Price, family = binomial(logit), data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9165 -1.2467 0.9709 1.0778 1.4801
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.048e-01 1.157e+00 -0.609 0.5425
## Low -1.154e-04 4.940e-05 -2.337 0.0195 *
## Volume.BTC 3.856e-07 7.951e-06 0.049 0.9613
## ETH.Volume -3.562e-09 1.889e-09 -1.886 0.0593 .
## XRP.Volume 1.004e-08 4.457e-09 2.252 0.0243 *
## Google.Search 1.638e-02 1.019e-02 1.608 0.1079
## SP500.Price 5.358e-04 4.796e-04 1.117 0.2639
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1045.9 on 760 degrees of freedom
## Residual deviance: 1033.8 on 754 degrees of freedom
## AIC: 1047.8
##
## Number of Fisher Scoring iterations: 4
Removal of Volume.BTC * This will be the last iteration of regression_v1, although the P value for every variable except “Low” and “XRP.Volume” is higher than the benchmark of 0.05. * This indicates most of the variables we have used are statistically insignificant * If we were to extend this analysis, we would look into ANOVA and Lower and Upper 95% to get a stronger understanding of the coefficients/variables
# Refit without Volume.BTC; this is the last regression V1 model
LogisticReg <- glm(
  formula = HL.Close ~ Low + ETH.Volume + XRP.Volume + Google.Search +
    SP500.Price,
  family = binomial(logit),
  data = trainingSet
)
# get summary statistics
summary(LogisticReg)
##
## Call:
## glm(formula = HL.Close ~ Low + ETH.Volume + XRP.Volume + Google.Search +
## SP500.Price, family = binomial(logit), data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9186 -1.2474 0.9711 1.0781 1.4741
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.013e-01 1.155e+00 -0.607 0.5437
## Low -1.156e-04 4.928e-05 -2.346 0.0190 *
## ETH.Volume -3.528e-09 1.755e-09 -2.011 0.0443 *
## XRP.Volume 1.003e-08 4.453e-09 2.251 0.0244 *
## Google.Search 1.647e-02 1.004e-02 1.640 0.1010
## SP500.Price 5.361e-04 4.795e-04 1.118 0.2635
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1045.9 on 760 degrees of freedom
## Residual deviance: 1033.8 on 755 degrees of freedom
## AIC: 1045.8
##
## Number of Fisher Scoring iterations: 4
Only include statistically significant variables (Low and XRP.Volume)
# Regression V2: HL.Close on Low and XRP.Volume only
LogisticReg_v2 <- glm(
  formula = HL.Close ~ Low + XRP.Volume,
  family = binomial(logit),
  data = trainingSet
)
# get summary statistics
summary(LogisticReg_v2)
##
## Call:
## glm(formula = HL.Close ~ Low + XRP.Volume, family = binomial(logit),
## data = trainingSet)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6534 -1.2453 0.9809 1.0932 1.3124
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.489e-01 1.701e-01 3.226 0.00126 **
## Low -5.929e-05 2.395e-05 -2.476 0.01329 *
## XRP.Volume 5.195e-09 3.090e-09 1.681 0.09271 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1045.9 on 760 degrees of freedom
## Residual deviance: 1039.0 on 758 degrees of freedom
## AIC: 1045
##
## Number of Fisher Scoring iterations: 4
# Perform predictions for the testing set with regression V1
predictions <- predict(LogisticReg, newdata = testingSet, type = "response")
# Round the predicted values to 0/1 labels
predictedLabels <- round(predictions, digits = 0)
We compute the misclassification rate of regression V1 (the rate of incorrect predictions).
# Number of rows in the test set
sizeTestSet <- nrow(testingSet)
# Count of test rows whose predicted label differs from the true HL.Close
error <- sum(testingSet$HL.Close != predictedLabels)
# Share of test rows that were misclassified
misclassification_rate <- error / sizeTestSet
# Display the misclassification rate
print(misclassification_rate)
## [1] 0.4764398
# Perform predictions for the testing set with regression V2
predictions <- predict(LogisticReg_v2, newdata = testingSet, type = "response")
The predict function returns continuous values between 0 and 1. We need to convert these values to the discrete 0/1 classes
# Round the predicted values to 0/1 labels
predictedLabels <- round(predictions, digits = 0)
# Number of rows in the test set
sizeTestSet <- nrow(testingSet)
# Count of test rows whose predicted label differs from the true HL.Close
error <- sum(testingSet$HL.Close != predictedLabels)
# Share of test rows that were misclassified
misclassification_rate <- error / sizeTestSet
# Display the misclassification rate
print(misclassification_rate)
## [1] 0.4659686
# Copy the data for the tree model and recode HL.Close from 0/1 into a factor
# labelled "L" (for 0) and "H" (for 1)
master_tree <- master
master_tree$HL.Close <- factor(
  x = master_tree$HL.Close,
  levels = c(0, 1),
  labels = c("L", "H")
)
# Confirm the factor levels
levels(master_tree$HL.Close)
## [1] "L" "H"
Create Training and Testing sets (Note that this data set is small so let us keep 90% for training)
# Create Training and Testing Sets: a random 90% / 10% split of master_tree.
# NOTE(review): no set.seed() call is visible before this draw, so the split
# changes between runs - confirm.
num_samples <- nrow(master_tree)
sampling.rate <- 0.9
# seq_len() stays empty for a zero-row frame (1:0 would yield c(1, 0));
# floor() makes sample()'s truncation of a fractional size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_tree[training, ]
# Every row not drawn for training goes to the testing set.
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_tree[testing, ]
Fit a decision tree to predict HL.Close using all the other variables.
library("rpart")
# Fit a classification tree for HL.Close on every other column of the training data
decTreeModel <- rpart(
  formula = HL.Close ~ .,
  data = trainingSet,
  method = "class"
)
Display the tree
# Base-graphics rendering: draw the branches first, then add the labels
plot(decTreeModel, margin = 0.1)
text(decTreeModel)
# Draw the same tree a second time with the rpart.plot package
library(rpart.plot)
rpart.plot(decTreeModel)
Tune the size of the tree to avoid overfitting
# Plot the fitted tree's complexity-parameter table to help choose a cp
# value for pruning
plotcp(decTreeModel)
Prune the tree at cp = 0.018 (TODO: check that this value is right against the plotcp output above).
# Cut the tree back using a complexity parameter of 0.018
pruned_decTreeModel <- prune(decTreeModel, cp = 0.018)
# Show the pruned tree: base graphics first, then rpart.plot
plot(pruned_decTreeModel, margin = 0.1)
text(pruned_decTreeModel)
rpart.plot(pruned_decTreeModel)
Evaluate the decision tree model using the testing set
# Predict a class label for every row of the testing set
predictedLabels <- predict(
  pruned_decTreeModel,
  newdata = testingSet,
  type = "class"
)
print(predictedLabels)
## 1373 1367 1366 1349 1347 1342 1321 1299 1248 1228 1222 1201 1199 1179 1171 1121
## H H H H H H H H H H H H H H H H
## 1117 1111 1054 1040 1031 985 983 954 922 921 914 906 899 883 873 866
## H H H H H H H H H H H H H H H L
## 855 838 837 821 817 813 801 800 794 789 778 764 753 752 726 705
## H H H H H H H H H L L L L L H H
## 703 701 689 676 675 666 619 613 579 561 558 515 512 508 498 451
## H H H H H H H L H H H H H H H H
## 442 428 414 401 396 381 347 344 330 288 270 256 232 207 198 187
## H H H L H H H H H H H H H H H H
## 183 165 162 157 150 145 116 100 99 96 94 22 15 11 3 2
## H H H H H H H H H H H H H H H H
## Levels: L H
Show the true labels
# Print the actual HL.Close labels of the testing set for comparison with
# the predictions
print(testingSet$HL.Close)
## [1] H H L H L H H H H L L L H H L L H H H L L H H L L L L H H H L L H L L H H H
## [39] H L H L L H H H H H H H L H H L H L H H H H L H L L H L L L L L L L H H L H
## [77] H H L H H H H L L L H L H L H H L H H H
## Levels: L H
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true label
error <- sum(testingSet$HL.Close != predictedLabels)
# Misclassification rate = share of test rows predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.40625
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Reuse the tree data set, in which the Y variable (HL.Close) is already a
# factor
master_forest <- master_tree
# Preview the first rows
head(master_forest)
## Open High Low Close Volume.BTC Volume.USD HL.Close ETH.Price
## 1391 917.31 923.95 914.69 923.45 3678.36 3385239 H 10.59
## 1390 923.45 971.24 922.83 970.92 6624.94 6298154 H 10.74
## 1389 970.92 991.38 963.84 989.71 5983.96 5835317 H 10.73
## 1388 989.71 1010.00 978.74 1007.66 5623.69 5602317 H 10.82
## 1387 1007.66 1024.50 994.34 1016.77 6731.61 6815466 H 10.95
## 1384 1019.31 1027.70 1014.64 1024.39 4227.33 4321741 H 11.34
## ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391 437746.5 4.03 23569.63 0.00645 3249.53 4
## 1390 413350.2 4.07 35332.93 0.00641 13926.48 4
## 1389 630953.7 4.08 17621.75 0.00649 13118.79 4
## 1388 513774.8 4.09 17753.63 0.00640 13887.87 4
## 1387 531755.4 4.06 15202.71 0.00638 12139.60 4
## 1384 509447.5 3.97 6224.56 0.00638 2697.23 4
## SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391 2280.90 3591270000 1193.2 50503
## 1390 2278.87 4087450000 1208.6 3212
## 1389 2279.55 3916610000 1205.6 1145
## 1388 2280.85 3807710000 1216.7 1512
## 1387 2297.42 3597970000 1218.5 865
## 1384 2292.56 3109050000 1230.0 908
# Create Training and Testing Sets
# 90% of the rows are drawn at random (without replacement) for training;
# every remaining row goes to the testing set.
num_samples <- nrow(master_forest)
sampling.rate <- 0.9
# seq_len() stays empty for an empty data frame (1:0 would give c(1, 0)),
# and floor() makes the truncation of a fractional sample size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_forest[training, ]
# Row indices that were not sampled for training
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_forest[testing, ]
# Fit a random forest for HL.Close using every other column as a predictor
RandForestModel <- randomForest(HL.Close ~ ., data = trainingSet)
Interpret the graph
# Plot the forest's error-rate curves
plot(RandForestModel)
# Label the curves with the column names of the err.rate matrix
legend(
  "top",
  legend = colnames(RandForestModel$err.rate),
  fill = 1:3
)
# Predict a label for every row of the testing set
predictedLabels <- predict(RandForestModel, newdata = testingSet)
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true label
error <- sum(testingSet$HL.Close != predictedLabels)
# Misclassification rate = share of test rows predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4166667
# ensure results are repeatable
set.seed(7)
# load the libraries
library(mlbench)
library(caret)
# master_tree already exists in the workspace. data() only loads data sets
# that ship with packages, so calling data(master_tree) did nothing except
# emit a "data set not found" warning; the call has been removed.
# prepare training scheme: 10-fold cross-validation repeated 3 times
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# train a learning vector quantization ("lvq") model on scaled predictors
model <- train(
  HL.Close ~ .,
  data = master_tree,
  method = "lvq",
  preProcess = "scale",
  trControl = control
)
# estimate variable importance (scores left unscaled)
importance <- varImp(model, scale = FALSE)
# summarize importance
print(importance)
## ROC curve variable importance
##
## Importance
## LTC.Price 0.5359
## Close 0.5357
## Gold.Volume 0.5342
## Low 0.5339
## High 0.5314
## XRP.Price 0.5310
## Open 0.5309
## SP500.Volume 0.5308
## Volume.BTC 0.5273
## SP500.Price 0.5266
## ETH.Price 0.5232
## Google.Search 0.5201
## LTC.Volume 0.5095
## Gold.Price 0.5089
## Volume.USD 0.5023
## ETH.Volume 0.5022
## XRP.Volume 0.5006
# Plot the per-variable importance scores stored in `importance`
plot(importance)
# KNN
# Start from a fresh copy of the master data (HL.Close is still numeric 0/1)
master_knn <- master
# Preview the first rows
head(master_knn)
## Open High Low Close Volume.BTC Volume.USD HL.Close ETH.Price
## 1391 917.31 923.95 914.69 923.45 3678.36 3385239 1 10.59
## 1390 923.45 971.24 922.83 970.92 6624.94 6298154 1 10.74
## 1389 970.92 991.38 963.84 989.71 5983.96 5835317 1 10.73
## 1388 989.71 1010.00 978.74 1007.66 5623.69 5602317 1 10.82
## 1387 1007.66 1024.50 994.34 1016.77 6731.61 6815466 1 10.95
## 1384 1019.31 1027.70 1014.64 1024.39 4227.33 4321741 1 11.34
## ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391 437746.5 4.03 23569.63 0.00645 3249.53 4
## 1390 413350.2 4.07 35332.93 0.00641 13926.48 4
## 1389 630953.7 4.08 17621.75 0.00649 13118.79 4
## 1388 513774.8 4.09 17753.63 0.00640 13887.87 4
## 1387 531755.4 4.06 15202.71 0.00638 12139.60 4
## 1384 509447.5 3.97 6224.56 0.00638 2697.23 4
## SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391 2280.90 3591270000 1193.2 50503
## 1390 2278.87 4087450000 1208.6 3212
## 1389 2279.55 3916610000 1205.6 1145
## 1388 2280.85 3807710000 1216.7 1512
## 1387 2297.42 3597970000 1218.5 865
## 1384 2292.56 3109050000 1230.0 908
# Move column 7 (HL.Close) to the front; the other 17 columns keep their
# relative order
master_knn <- master_knn[c(7, 1:6, 8:18)]
head(master_knn)
## HL.Close Open High Low Close Volume.BTC Volume.USD ETH.Price
## 1391 1 917.31 923.95 914.69 923.45 3678.36 3385239 10.59
## 1390 1 923.45 971.24 922.83 970.92 6624.94 6298154 10.74
## 1389 1 970.92 991.38 963.84 989.71 5983.96 5835317 10.73
## 1388 1 989.71 1010.00 978.74 1007.66 5623.69 5602317 10.82
## 1387 1 1007.66 1024.50 994.34 1016.77 6731.61 6815466 10.95
## 1384 1 1019.31 1027.70 1014.64 1024.39 4227.33 4321741 11.34
## ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391 437746.5 4.03 23569.63 0.00645 3249.53 4
## 1390 413350.2 4.07 35332.93 0.00641 13926.48 4
## 1389 630953.7 4.08 17621.75 0.00649 13118.79 4
## 1388 513774.8 4.09 17753.63 0.00640 13887.87 4
## 1387 531755.4 4.06 15202.71 0.00638 12139.60 4
## 1384 509447.5 3.97 6224.56 0.00638 2697.23 4
## SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391 2280.90 3591270000 1193.2 50503
## 1390 2278.87 4087450000 1208.6 3212
## 1389 2279.55 3916610000 1205.6 1145
## 1388 2280.85 3807710000 1216.7 1512
## 1387 2297.42 3597970000 1218.5 865
## 1384 2292.56 3109050000 1230.0 908
# Normalize all the attributes (NOT HL.Close): subtract the column mean and
# divide by the column standard deviation
knn_feature_names <- c(
  "Open", "High", "Low", "Close", "Volume.BTC", "Volume.USD",
  "ETH.Price", "ETH.Volume", "LTC.Price", "LTC.Volume",
  "XRP.Price", "XRP.Volume", "Google.Search",
  "SP500.Price", "SP500.Volume", "Gold.Price", "Gold.Volume"
)
master_knn[knn_feature_names] <- lapply(
  master_knn[knn_feature_names],
  function(values) (values - mean(values)) / sd(values)
)
# Create Training and Testing Sets
# 80% of the rows are drawn at random (without replacement) for training;
# every remaining row goes to the testing set.
num_samples <- nrow(master_knn)
sampling.rate <- 0.8
# seq_len() stays empty for an empty data frame (1:0 would give c(1, 0)),
# and floor() makes the truncation of a fractional sample size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_knn[training, ]
# Row indices that were not sampled for training
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_knn[testing, ]
# Training features: every column except the label
trainingfeatures <- trainingSet[, names(trainingSet) != "HL.Close", drop = FALSE]
# Training labels
traininglabels <- trainingSet[["HL.Close"]]
# Testing features: every column except the label
testingfeatures <- testingSet[, names(testingSet) != "HL.Close", drop = FALSE]
# Load the classification library
library(class)
# Classify each test row from its 3 nearest training neighbours
predictedLabels <- knn(
  train = trainingfeatures,
  test = testingfeatures,
  cl = traininglabels,
  k = 3
)
Display the predicted Labels
head(predictedLabels)
## [1] 1 0 1 1 1 1
## Levels: 0 1
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true label
error <- sum(testingSet$HL.Close != predictedLabels)
# Misclassification rate = share of test rows predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.460733
# Second KNN variant: keep only the label plus the Close and LTC.Price
# columns of the normalized data
master_knnv2 <- master_knn[, c("HL.Close", "Close", "LTC.Price")]
head(master_knnv2)
## HL.Close Close LTC.Price
## 1391 1 -1.789247 -1.278820
## 1390 1 -1.775800 -1.278050
## 1389 1 -1.770477 -1.277857
## 1388 1 -1.765393 -1.277665
## 1387 1 -1.762812 -1.278242
## 1384 1 -1.760654 -1.279976
# Create Training and Testing Sets
# 90% of the rows are drawn at random (without replacement) for training;
# every remaining row goes to the testing set.
num_samples <- nrow(master_knnv2)
sampling.rate <- 0.9
# seq_len() stays empty for an empty data frame (1:0 would give c(1, 0)),
# and floor() makes the truncation of a fractional sample size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_knnv2[training, ]
# Row indices that were not sampled for training
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_knnv2[testing, ]
# Training features: every column except the label
trainingfeatures <- trainingSet[, names(trainingSet) != "HL.Close", drop = FALSE]
# Training labels
traininglabels <- trainingSet[["HL.Close"]]
# Testing features: every column except the label
testingfeatures <- testingSet[, names(testingSet) != "HL.Close", drop = FALSE]
# Load the classification library
library(class)
# Classify each test row from its 3 nearest training neighbours
predictedLabels <- knn(
  train = trainingfeatures,
  test = testingfeatures,
  cl = traininglabels,
  k = 3
)
Display the predicted Labels
head(predictedLabels)
## [1] 1 1 1 0 1 1
## Levels: 0 1
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true label
error <- sum(testingSet$HL.Close != predictedLabels)
# Misclassification rate = share of test rows predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4791667
# Third KNN variant: keep only the label plus the Low and XRP.Volume
# columns of the normalized data
master_knnv3 <- master_knn[, c("HL.Close", "Low", "XRP.Volume")]
head(master_knnv3)
## HL.Close Low XRP.Volume
## 1391 1 -1.804629 -0.5685338
## 1390 1 -1.802205 -0.5681487
## 1389 1 -1.789993 -0.5681778
## 1388 1 -1.785556 -0.5681501
## 1387 1 -1.780911 -0.5682131
## 1384 1 -1.774866 -0.5685537
# Create Training and Testing Sets
# 90% of the rows are drawn at random (without replacement) for training;
# every remaining row goes to the testing set.
num_samples <- nrow(master_knnv3)
sampling.rate <- 0.9
# seq_len() stays empty for an empty data frame (1:0 would give c(1, 0)),
# and floor() makes the truncation of a fractional sample size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_knnv3[training, ]
# Row indices that were not sampled for training
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_knnv3[testing, ]
# Training features: every column except the label
trainingfeatures <- trainingSet[, names(trainingSet) != "HL.Close", drop = FALSE]
# Training labels
traininglabels <- trainingSet[["HL.Close"]]
# Testing features: every column except the label
testingfeatures <- testingSet[, names(testingSet) != "HL.Close", drop = FALSE]
# Load the classification library
library(class)
# Classify each test row from its 3 nearest training neighbours
predictedLabels <- knn(
  train = trainingfeatures,
  test = testingfeatures,
  cl = traininglabels,
  k = 3
)
Display the predicted Labels
head(predictedLabels)
## [1] 1 0 1 1 1 0
## Levels: 0 1
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true label
error <- sum(testingSet$HL.Close != predictedLabels)
# Misclassification rate = share of test rows predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4479167
# ensure results are repeatable
set.seed(7)
# load the libraries
library(mlbench)
library(caret)
# master_knn already exists in the workspace. data() only loads data sets
# that ship with packages, so calling data(master_knn) did nothing except
# emit a "data set not found" warning; the call has been removed.
# define the control using a random forest selection function and
# 10-fold cross-validation
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# HL.Close was moved to column 1 of master_knn earlier, so positional
# indexing (columns 1:17 as predictors, column 18 as outcome) put the label
# among the predictors and used Gold.Volume as the outcome. Select both by
# name instead.
rfe_predictors <- master_knn[, setdiff(names(master_knn), "HL.Close"), drop = FALSE]
# Pass the 0/1 label as a factor so rfe() treats this as classification
rfe_outcome <- factor(master_knn$HL.Close)
# run the RFE algorithm over subset sizes 1 to 8
results <- rfe(
  rfe_predictors,
  rfe_outcome,
  sizes = c(1:8),
  rfeControl = control
)
# summarize the results
print(results)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 1.1377 0.003179 0.3261 0.3477 0.00658 0.10423
## 2 1.0079 0.016361 0.3158 0.3771 0.02836 0.10000
## 3 0.9833 0.019386 0.3241 0.3643 0.04547 0.09275
## 4 0.9652 0.012435 0.3134 0.3720 0.02613 0.10128
## 5 0.9603 0.008677 0.3128 0.3712 0.01597 0.10005 *
## 6 0.9676 0.007810 0.3172 0.3668 0.01201 0.10322
## 7 0.9698 0.009991 0.3210 0.3675 0.01536 0.10259
## 8 0.9637 0.015377 0.3194 0.3678 0.02498 0.10025
## 17 0.9695 0.018932 0.3368 0.3712 0.03602 0.09967
##
## The top 5 variables (out of 5):
## ETH.Price, SP500.Price, XRP.Volume, LTC.Price, ETH.Volume
# List the names of the features retained by the RFE run stored in `results`
predictors(results)
## [1] "ETH.Price" "SP500.Price" "XRP.Volume" "LTC.Price" "ETH.Volume"
# Plot the resampling performance stored in `results`
# (type "g" adds a grid, "o" draws points joined by lines)
plot(results, type=c("g", "o"))
# Recursive Feature Selection
# Start from a fresh copy of the master data
master_rfe <- master
# Outcome: the HL.Close column exactly as stored in master
y <- master_rfe$HL.Close
# Predictors: every other column
x <- master_rfe[, names(master_rfe) != "HL.Close", drop = FALSE]
# Estimate caret's default pre-processing on the predictors, apply it, and
# keep the result as a data frame
normalization <- preProcess(x)
x <- as.data.frame(predict(normalization, x))
head(x)
## Open High Low Close Volume.BTC Volume.USD ETH.Price
## 1391 -1.792960 -1.787230 -1.804629 -1.789247 -0.9967384 -0.9056995 -1.325329
## 1390 -1.791214 -1.774265 -1.802205 -1.775800 -0.7381414 -0.8824137 -1.324625
## 1389 -1.777715 -1.768744 -1.789993 -1.770477 -0.7943950 -0.8861136 -1.324671
## 1388 -1.772372 -1.763639 -1.785556 -1.765393 -0.8260129 -0.8879762 -1.324249
## 1387 -1.767268 -1.759664 -1.780911 -1.762812 -0.7287799 -0.8782783 -1.323639
## 1384 -1.763956 -1.758787 -1.774866 -1.760654 -0.9485599 -0.8982131 -1.321809
## ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391 -0.7388230 -1.278820 -0.4207717 -1.159030 -0.5685338 -0.8033421
## 1390 -0.7391833 -1.278050 -0.4205950 -1.159165 -0.5681487 -0.8033421
## 1389 -0.7359696 -1.277857 -0.4208610 -1.158896 -0.5681778 -0.8033421
## 1388 -0.7377002 -1.277665 -0.4208590 -1.159198 -0.5681501 -0.8033421
## 1387 -0.7374346 -1.278242 -0.4208973 -1.159265 -0.5682131 -0.8033421
## 1384 -0.7377641 -1.279976 -0.4210321 -1.159265 -0.5685537 -0.8033421
## SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391 -1.765237 -0.27908820 -1.0146212 1.27925706
## 1390 -1.771899 0.20060822 -0.9445742 -0.08279469
## 1389 -1.769667 0.03544369 -0.9582197 -0.14232739
## 1388 -1.765401 -0.06983854 -0.9077314 -0.13175724
## 1387 -1.711019 -0.27261078 -0.8995439 -0.15039181
## 1384 -1.726969 -0.74528838 -0.8472362 -0.14915335
# Preview the first outcome values
head(y)
## [1] 1 1 1 1 1 1
# Candidate subset sizes: 1 through 18
subsets <- seq(1, 18)
# Fix the random seed so the resampling is repeatable
set.seed(10)
# Linear-model helper functions, cross-validation repeated 3 times
# NOTE(review): y is passed to lmFuncs as given; confirm that fitting a
# linear model to the 0/1 label is intended.
ctrl <- rfeControl(
  functions = lmFuncs,
  method = "repeatedcv",
  repeats = 3,
  verbose = FALSE
)
lmProfile <- rfe(x, y, sizes = subsets, rfeControl = ctrl)
lmProfile
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 3 times)
##
## Resampling performance over subset size:
##
## Variables RMSE Rsquared MAE RMSESD RsquaredSD MAESD Selected
## 1 0.4984 0.017182 0.4954 0.006789 0.022279 0.006055 *
## 2 0.4986 0.015091 0.4954 0.006878 0.020125 0.006185
## 3 0.4991 0.011846 0.4956 0.006826 0.013979 0.006199
## 4 0.5000 0.008700 0.4951 0.007351 0.009308 0.006500
## 5 0.4999 0.011001 0.4942 0.008156 0.010876 0.007341
## 6 0.5001 0.011282 0.4936 0.008556 0.012683 0.007617
## 7 0.5003 0.012203 0.4931 0.009126 0.014069 0.008019
## 8 0.5007 0.009153 0.4927 0.008666 0.008585 0.007483
## 9 0.5007 0.009738 0.4923 0.009013 0.010582 0.007648
## 10 0.5010 0.009486 0.4924 0.009131 0.011287 0.007831
## 11 0.5009 0.009580 0.4921 0.009023 0.012408 0.007816
## 12 0.5017 0.008454 0.4927 0.009423 0.011090 0.008061
## 13 0.5020 0.007977 0.4930 0.009407 0.010644 0.008014
## 14 0.5024 0.007374 0.4933 0.009250 0.009620 0.007751
## 15 0.5025 0.007248 0.4935 0.009367 0.009122 0.007891
## 16 0.5026 0.007254 0.4936 0.009418 0.009115 0.007954
## 17 0.5026 0.007285 0.4936 0.009430 0.009131 0.007965
##
## The top 1 variables (out of 1):
## Low
# List the names of the features retained by the lm-based RFE run
predictors(lmProfile)
## [1] "Low"
# Show the model stored in the `fit` element of the RFE result
lmProfile$fit
##
## Call:
## lm(formula = y ~ ., data = tmp)
##
## Coefficients:
## (Intercept) Low
## 0.54832 -0.02541
# Preview the first rows of the per-resample performance table
head(lmProfile$resample)
## Variables RMSE Rsquared MAE Resample
## 1 1 0.4944017 5.723296e-05 0.4919751 Fold01.Rep1
## 18 1 0.4897414 6.199280e-03 0.4877692 Fold02.Rep1
## 35 1 0.5014909 6.588659e-03 0.4981807 Fold03.Rep1
## 52 1 0.5152455 3.860430e-02 0.5102679 Fold04.Rep1
## 69 1 0.4964578 1.985132e-02 0.4935238 Fold05.Rep1
## 86 1 0.4995176 7.608472e-03 0.4966717 Fold06.Rep1
# Apply caret's lattice theme, then plot the resampling performance stored
# in lmProfile (type "g" adds a grid, "o" draws points joined by lines)
trellis.par.set(caretTheme())
plot(lmProfile, type = c("g", "o"))
# SVM
# Work on a copy of master_cor
master_svm <- master_cor
# Preview the first rows
head(master_svm)
## Open High Low Close Volume.BTC Volume.USD HL.Close ETH.Price
## 1391 917.31 923.95 914.69 923.45 3678.36 3385239 1 10.59
## 1390 923.45 971.24 922.83 970.92 6624.94 6298154 1 10.74
## 1389 970.92 991.38 963.84 989.71 5983.96 5835317 1 10.73
## 1388 989.71 1010.00 978.74 1007.66 5623.69 5602317 1 10.82
## 1387 1007.66 1024.50 994.34 1016.77 6731.61 6815466 1 10.95
## 1384 1019.31 1027.70 1014.64 1024.39 4227.33 4321741 1 11.34
## ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391 437746.5 4.03 23569.63 0.00645 3249.53 4
## 1390 413350.2 4.07 35332.93 0.00641 13926.48 4
## 1389 630953.7 4.08 17621.75 0.00649 13118.79 4
## 1388 513774.8 4.09 17753.63 0.00640 13887.87 4
## 1387 531755.4 4.06 15202.71 0.00638 12139.60 4
## 1384 509447.5 3.97 6224.56 0.00638 2697.23 4
## SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391 2280.90 3591270000 1193.2 50503
## 1390 2278.87 4087450000 1208.6 3212
## 1389 2279.55 3916610000 1205.6 1145
## 1388 2280.85 3807710000 1216.7 1512
## 1387 2297.42 3597970000 1218.5 865
## 1384 2292.56 3109050000 1230.0 908
# Normalize all the attributes (NOT HL.Close): subtract the column mean and
# divide by the column standard deviation
svm_feature_names <- c(
  "Open", "High", "Low", "Close", "Volume.BTC", "Volume.USD",
  "ETH.Price", "ETH.Volume", "LTC.Price", "LTC.Volume",
  "XRP.Price", "XRP.Volume", "Google.Search",
  "SP500.Price", "SP500.Volume", "Gold.Price", "Gold.Volume"
)
master_svm[svm_feature_names] <- lapply(
  master_svm[svm_feature_names],
  function(values) (values - mean(values)) / sd(values)
)
Make sure that the value that you are trying to predict is a factor
# Recode the 0/1 target as a two-level factor: 0 becomes "L", 1 becomes "H"
master_svm$HL.Close <- factor(
  master_svm$HL.Close,
  levels = c(0, 1),
  labels = c("L", "H")
)
# Confirm the factor levels
levels(master_svm$HL.Close)
## [1] "L" "H"
# Create Training and Testing Sets
# 80% of the rows are drawn at random (without replacement) for training;
# every remaining row goes to the testing set.
num_samples <- nrow(master_svm)
sampling.rate <- 0.8
# seq_len() stays empty for an empty data frame (1:0 would give c(1, 0)),
# and floor() makes the truncation of a fractional sample size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_svm[training, ]
# Row indices that were not sampled for training
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_svm[testing, ]
# Load the SVM Library
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
##
## impute
# Fit an SVM with a linear kernel and an error cost of 20
svmModel <- svm(
  HL.Close ~ .,
  data = trainingSet,
  kernel = "linear",
  cost = 20
)
# Predict a label for every row of the testing set and display the result
predictedLabels <- predict(svmModel, testingSet)
predictedLabels
## 1391 1389 1383 1382 1380 1376 1366 1355 1349 1338 1333 1324 1320 1319 1314 1313
## H H H H H H H H H H H H H H H H
## 1305 1291 1289 1286 1275 1270 1269 1255 1251 1247 1241 1226 1221 1213 1212 1207
## H H H H H H H H H H H H H H H H
## 1205 1186 1178 1177 1172 1165 1163 1158 1146 1132 1128 1124 1123 1108 1097 1076
## H H H H H H H H H H H H H H H L
## 1065 1059 1052 1051 1034 1023 1011 1009 1005 990 988 982 976 974 971 941
## L L H L H L H H H H H H H L H L
## 940 929 921 911 900 890 884 870 869 864 858 855 850 849 848 841
## L L H H L H H H H H L H L L L L
## 836 831 820 817 814 813 809 806 802 789 788 782 760 750 746 745
## H L L L L L L L L L L H L L L L
## 704 695 655 653 652 649 648 647 633 619 617 610 607 604 578 575
## H H H H H H H H H H H H H H H H
## 572 565 563 547 529 526 519 509 499 495 491 464 463 460 453 442
## H H H L L L L H L L L L L L L L
## 438 429 410 409 402 401 396 387 382 368 367 355 338 305 285 275
## L L H L H H H L L H H H H H H H
## 269 263 260 254 253 248 243 242 240 233 227 207 205 192 190 184
## H H H H H H H H H H H H H H H H
## 172 164 155 149 143 142 131 130 127 117 113 107 106 103 101 100
## H H H H H H H H H H H H H H H H
## 96 82 80 71 65 64 60 58 50 47 44 31 11 4 3
## H H H H H H H H H H H H L L H
## Levels: L H
# Calculate the misclassification rate
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true label
error <- sum(testingSet$HL.Close != predictedLabels)
# Misclassification rate = share of test rows predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.513089
# Fit an SVM with a polynomial kernel and an error cost of 20
svmModel <- svm(
  HL.Close ~ .,
  data = trainingSet,
  kernel = "polynomial",
  cost = 20
)
Let us now do some predictions on the test set
# Predict a label for every row of the testing set and display the result
predictedLabels <- predict(svmModel, testingSet)
predictedLabels
## 1391 1389 1383 1382 1380 1376 1366 1355 1349 1338 1333 1324 1320 1319 1314 1313
## H H H H H H H H H H H H H H H H
## 1305 1291 1289 1286 1275 1270 1269 1255 1251 1247 1241 1226 1221 1213 1212 1207
## H H H H H H H L L H H H L H H H
## 1205 1186 1178 1177 1172 1165 1163 1158 1146 1132 1128 1124 1123 1108 1097 1076
## H H H L H H H H H H H H H H H L
## 1065 1059 1052 1051 1034 1023 1011 1009 1005 990 988 982 976 974 971 941
## L L L L H H H H H L L H H H H L
## 940 929 921 911 900 890 884 870 869 864 858 855 850 849 848 841
## H H L L H H H H H H L H L L L H
## 836 831 820 817 814 813 809 806 802 789 788 782 760 750 746 745
## H H H H H H H H H H H H H H H H
## 704 695 655 653 652 649 648 647 633 619 617 610 607 604 578 575
## H H H H H H H H H H H H H H H H
## 572 565 563 547 529 526 519 509 499 495 491 464 463 460 453 442
## H H H H H H H H L L L H L L L L
## 438 429 410 409 402 401 396 387 382 368 367 355 338 305 285 275
## L L L H H H H L L L L H H L L H
## 269 263 260 254 253 248 243 242 240 233 227 207 205 192 190 184
## H L L H H H H H H H H H L H H H
## 172 164 155 149 143 142 131 130 127 117 113 107 106 103 101 100
## H H L H L H H H H L H H H H H H
## 96 82 80 71 65 64 60 58 50 47 44 31 11 4 3
## L L H H H L H H H H H L H H L
## Levels: L H
We compute the misclassification rate (the rate of incorrect predictions).
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true label
error <- sum(testingSet$HL.Close != predictedLabels)
# Misclassification rate = share of test rows predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.486911
# Fit an SVM with a radial kernel and an error cost of 20
svmModel <- svm(
  HL.Close ~ .,
  data = trainingSet,
  kernel = "radial",
  cost = 20
)
# Predict a label for every row of the testing set
predictedLabels <- predict(svmModel, testingSet)
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true label
error <- sum(testingSet$HL.Close != predictedLabels)
# Misclassification rate = share of test rows predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4973822
library("anytime")
library("bsts")
## Loading required package: BoomSpikeSlab
## Loading required package: Boom
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked _by_ '.GlobalEnv':
##
## SP500
##
## Attaching package: 'Boom'
## The following object is masked from 'package:stats':
##
## rWishart
##
## Attaching package: 'BoomSpikeSlab'
## The following object is masked from 'package:stats':
##
## knots
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: xts
##
## Attaching package: 'bsts'
## The following object is masked from 'package:BoomSpikeSlab':
##
## SuggestBurn
library("car")
## Loading required package: carData
library("caret")
library("forecast")
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
## The following object is masked _by_ '.GlobalEnv':
##
## gold
library("tseries")
library("TTR")
# Load the raw daily BTC/USD data
master_reg <- read.csv("Coinbase_BTCUSD_d.csv")
# Randomly keep 99.6% of the rows for training; the rest form the test set.
# (The split used to be computed twice in a row, with the first result
# immediately overwritten; only one split is needed.)
num_samples <- nrow(master_reg)
sampling.rate <- 0.996
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
testing <- setdiff(seq_len(num_samples), training)
# Training keeps Date, Open, High, Low, Close and Volume.USD
trainingSet <- master_reg[training, ]
trainingSet <- subset(trainingSet, select = -c(Timestamp, Symbol, Volume.BTC))
# Testing keeps only Date and Close
testingSet <- master_reg[testing, ]
testingSet <- subset(
  testingSet,
  select = -c(Timestamp, Symbol, Open, High, Low, Volume.BTC, Volume.USD)
)
# Held-out Close values (second column of the reduced test set)
testdata <- testingSet[, 2]
trainingSet$Date <- as.Date(anytime(trainingSet$Date))
testingSet$Date <- as.Date(anytime(testingSet$Date))
# Strip thousands separators and convert in one step. Previously the gsub()
# result was discarded because the next line converted Volume.USD again.
trainingSet$Volume <- as.numeric(gsub(",", "", trainingSet$Volume.USD))
# Index the remaining columns by date as an xts object
trainingSet <- xts(trainingSet[, -1], order.by = as.POSIXct(trainingSet$Date))
# Build a ts object from the 4th column of the training xts object (Close),
# with 365 observations per period, starting in 2015
trainingSetResult <- ts(trainingSet[, 4], frequency = 365, start = 2015)
# Decompose the series and plot the components
dects <- decompose(trainingSetResult)
plot(dects)
# Fit Holt's method on rows 1000 to 2000 of the Close series, without damping
holt_result <- holt(trainingSet[1000:2000, "Close"], type = "additive", damped = FALSE)
# Forecast one step per held-out row rather than a hard-coded 9, so the
# forecasts always have the same length as testingSet when they are bound
# together below.
holt_forecast <- forecast(holt_result, h = nrow(testingSet))
holtdf <- as.data.frame(holt_forecast)
holtdf
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 86486401 9453.302 8931.660 9974.945 8655.518 10251.09
## 86572801 9458.091 8720.446 10195.736 8329.960 10586.22
## 86659201 9462.880 8559.451 10366.309 8081.204 10844.56
## 86745601 9467.668 8424.451 10510.885 7872.205 11063.13
## 86832001 9472.457 8306.063 10638.850 7688.612 11256.30
## 86918401 9477.245 8199.473 10755.017 7523.062 11431.43
## 87004801 9482.034 8101.823 10862.245 7371.183 11592.88
## 87091201 9486.823 8011.249 10962.396 7230.127 11743.52
## 87177601 9491.611 7926.457 11056.765 7097.914 11885.31
# Plot the forecast table with the y-axis limited to 0-20000
plot(holtdf, ylim = c(0,20000))
# Attach the point forecasts (first column of holtdf) to the test rows.
# The new column is automatically named "holtdf[, 1]".
holtfdf <- cbind(testingSet, holtdf[,1])
holtfdf
## Date Close holtdf[, 1]
## 267 2020-02-28 8708.89 9453.302
## 289 2020-02-06 9763.01 9458.091
## 329 2019-12-28 7302.67 9462.880
## 996 2018-03-01 10895.92 9467.668
## 1025 2018-01-31 10099.99 9472.457
## 1552 2016-08-22 585.12 9477.245
## 1567 2016-08-07 593.90 9482.034
## 1772 2016-01-15 373.43 9486.823
## 1948 2015-07-23 276.91 9491.611
# Compare the point forecasts with the held-out Close values
accuracy(holtdf[,1], testdata)
## ME RMSE MAE MPE MAPE
## Test set -4072.475 6089.89 4597.075 -977.7898 982.7775
# Plot column 2 of holtfdf (Close) in blue and column 3 (the point
# forecasts) in dark red against Date
ggplot() +
  geom_line(data = holtfdf, aes(Date, holtfdf[,2]), color = "blue") +
  geom_line(data = holtfdf, aes(Date, holtfdf[,3]), color = "Dark Red")
```